pax_global_header00006660000000000000000000000064150223521560014513gustar00rootroot0000000000000052 comment=5a0c860f8a50be03704c7f900949511e89f6d718 rocPRIM-rocm-6.4.3/000077500000000000000000000000001502235215600137365ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.azuredevops/000077500000000000000000000000001502235215600163635ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.azuredevops/rocm-ci.yml000066400000000000000000000014051502235215600204370ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .gitlab - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .gitlab - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/rocPRIM.yml@pipelines_repo rocPRIM-rocm-6.4.3/.clang-format000066400000000000000000000114351502235215600163150ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 UseCRLF: false # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: true AlignArrayOfStructures: Right AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: 
false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false BitFieldColonSpacing: Both # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: AfterCaseLabel: true AfterClass: true AfterControlStatement: Always AfterEnum: true AfterFunction: true AfterNamespace: true AfterStruct: true AfterUnion: true AfterExternBlock: false BeforeCatch: true BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: All BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DeriveLineEnding: false DerivePointerAlignment: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IfMacros: [] IncludeBlocks: Preserve IndentAccessModifiers: false IndentCaseBlocks: true IndentCaseLabels: true IndentExternBlock: NoIndent IndentPPDirectives: BeforeHash IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true LambdaBodyIndentation: Signature MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PPIndentWidth: -1 PackConstructorInitializers: NextLine PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 
PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left QualifierAlignment: Leave ReferenceAlignment: Pointer ReflowComments: false ShortNamespaceLines: 0 SortIncludes: CaseSensitive SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: Never SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Never SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false AttributeMacros: - __host__ - __device__ - __global__ - __forceinline__ - __shared__ - __launch_bounds__ - ROCPRIM_DEVICE - ROCPRIM_HOST - ROCPRIM_HOST_DEVICE - ROCPRIM_SHARED_MEMORY - ROCPRIM_KERNEL - ROCPRIM_INLINE - ROCPRIM_FORCE_INLINE - ROCPRIM_LAUNCH_BOUNDS # Trick clang into thinking that our C-style attributes are C++-style attributes # Make sure that the sizes line up for linebreaks etc Macros: - __host__=[[host]] - __device__=[[device]] - __global__=[[global]] - __forceinline__=[[forceinline]] - __shared__=[[shared]] - __launch_bounds__(x)=[[launch_bounds(x)]] - __attribute__(x)=[[attribute(x)]] - ROCPRIM_DEVICE=[[DEVICE____]] - ROCPRIM_HOST=[[HOST____]] - ROCPRIM_HOST_DEVICE=[[HOST_DEVICE____]] - ROCPRIM_SHARED_MEMORY=[[SHARED_MEMORY____]] - ROCPRIM_KERNEL=[[KERNEL____]] - ROCPRIM_INLINE=[[INLINE____]] - ROCPRIM_FORCE_INLINE=[FORCE_INLINE____]] - ROCPRIM_LAUNCH_BOUNDS(x)=[[launch_bounds(x)____]] BreakAfterAttributes: Always --- 
rocPRIM-rocm-6.4.3/.githooks/000077500000000000000000000000001502235215600156435ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.githooks/install000077500000000000000000000002121502235215600172320ustar00rootroot00000000000000#!/bin/sh cd "$(git rev-parse --git-dir)" cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" rocPRIM-rocm-6.4.3/.githooks/pre-commit000077500000000000000000000015531502235215600176510ustar00rootroot00000000000000#!/bin/sh # Redirect output to stderr. exec 1>&2 check_failed=false # Do the code format check if ! "$(git rev-parse --show-toplevel)/scripts/code-format/check-format.sh" HEAD --cached 1>&2; then printf "\n\033[31mFailed\033[0m: code format check.\n" check_failed=true fi # Do the copyright check # update & apply copyright when hook config is set, otherwise just verify opts="-qc" if [ "$(git config --get --type bool --default false hooks.updateCopyright)" = "true" ]; then opts="-qca" fi if ! "$(git rev-parse --show-toplevel)/scripts/copyright-date/check-copyright.sh" "$opts" 1>&2; then printf "\n\033[31mFailed\033[0m: copyright date check.\n" check_failed=true fi if $check_failed; then printf " Pre-commit check failed, please fix the reported errors. 
Note: Use '\033[33mgit commit --no-verify\033[0m' to bypass checks.\n" exit 1 fi rocPRIM-rocm-6.4.3/.github/000077500000000000000000000000001502235215600152765ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.github/CODEOWNERS000077500000000000000000000003251502235215600166740ustar00rootroot00000000000000* @stanleytsang-amd @umfranzw @RobsonRLemos @lawruble13 # Documentation files docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation rocPRIM-rocm-6.4.3/.github/ISSUE_TEMPLATE/000077500000000000000000000000001502235215600174615ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.github/ISSUE_TEMPLATE/bug_report.md000066400000000000000000000022571502235215600221610ustar00rootroot00000000000000--- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Install '...' version '...' 2. Run '...' with data '...' 3. See error on logfile '...' **Expected behavior** A clear and concise description of what you expected to happen. **Log-files** Add *full* logfiles to help explain your problem. **Environment** Make sure that ROCm is correctly installed and run the following command: ``` printf '=== environment\n' > environment.txt && printf '\n\n=== date\n' >> environment.txt && date >> environment.txt && printf '\n\n=== Linux Kernel\n' >> environment.txt && uname -a >> environment.txt && printf '\n\n=== rocm-smi' >> environment.txt && rocm-smi >> environment.txt && printf '\n\n' >> environment.txt && hipconfig >> environment.txt && printf '\n\n=== rocminfo\n' >> environment.txt && rocminfo >> environment.txt && printf '\n\n=== lspci VGA\n' >> environment.txt && lspci | grep -i vga >> environment.txt ``` Attach `environment.txt` **Additional context** Add any other context about the problem here. 
rocPRIM-rocm-6.4.3/.github/ISSUE_TEMPLATE/feature_request.md000066400000000000000000000011231502235215600232030ustar00rootroot00000000000000--- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. rocPRIM-rocm-6.4.3/.github/dependabot.yml000066400000000000000000000012231502235215600201240ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. 
# Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" rocPRIM-rocm-6.4.3/.github/workflows/000077500000000000000000000000001502235215600173335ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.github/workflows/docs.yaml000066400000000000000000000045551502235215600211600ustar00rootroot00000000000000name: Upload to the upload server # Controls when the workflow will run on: push: branches: [develop, master] tags: - rocm-5.* release: types: [published] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: # This workflow contains a single job called "build" build: # The type of runner that the job will run on runs-on: ubuntu-latest # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - name: getting branch name shell: bash run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" id: branch_name - name: getting tag name shell: bash run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME})" id: tag_name - name: zipping files run: zip -r ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip . 
-x '*.git*' '*.idea*' - name: echo-step run: echo "${{ github.event.release.target_commitish }}" - name: uploading archive to prod if: ${{ steps.branch_name.outputs.branch == 'master' || github.event.release.target_commitish == 'master'}} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.PROD_UPLOAD_URL }}' args: '-o ConnectTimeout=5' - name: uploading archive to staging if: ${{ steps.branch_name.outputs.branch == 'develop' || github.event.release.target_commitish == 'develop' }} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.STG_UPLOAD_URL }}' args: '-o ConnectTimeout=5' rocPRIM-rocm-6.4.3/.gitignore000066400000000000000000000012211502235215600157220ustar00rootroot00000000000000### Build dirs ### build*/ ### clangd. 
### /.cache # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake CMakeUserPresets.json # End of https://www.gitignore.io/api/c++,cmake # VS Code # .vscode # Python __pycache__ rocPRIM-rocm-6.4.3/.gitignore.develop000066400000000000000000000012401502235215600173600ustar00rootroot00000000000000### Build dirs ### build/ ### Docs dirs ### doc/html/ doc/xml/ doc/latex/ doc/*.tag # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake build ### Gtilab CI ### .gitlab-ci-gputest.yml # End of https://www.gitignore.io/api/c++,cmake rocPRIM-rocm-6.4.3/.gitlab-ci.yml000066400000000000000000000451461502235215600164040ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
include: - project: 'amd/ci-templates' ref: main file: - /defaults.yaml - /deps-cmake.yaml - /deps-docs.yaml - /deps-format.yaml - /deps-rocm.yaml - /deps-vcpkg.yaml - /deps-windows.yaml - /deps-compiler-acceleration.yaml - /gpus-rocm.yaml - /rules.yaml stages: - lint - autotune - build - test - benchmark variables: PACKAGE_DIR: $BUILD_DIR/package AUTOTUNE_CONFIG_DIR: ${CI_PROJECT_DIR}/autotune_config clang-format: extends: - .lint:clang-format copyright-date: extends: - .deps:rocm stage: lint needs: [] tags: - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: - cd $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR - scripts/copyright-date/check-copyright.sh -v -d $CI_MERGE_REQUEST_DIFF_BASE_SHA .cmake-minimum-vcpkg: extends: - .deps:rocm - .deps:cmake-minimum - .deps:vcpkg - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:vcpkg", before_script] - !reference [".deps:compiler-acceleration", before_script] - $VCPKG_DIR/vcpkg install gtest benchmark .cmake-minimum-apt: extends: - .deps:rocm - .deps:cmake-minimum - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:compiler-acceleration", before_script] - $SUDO_CMD apt-get install -y -qq libgtest-dev libbenchmark-dev .build:vcpkg-apt: stage: build tags: - build extends: - .gpus:rocm-gpus - .rules:build # Missing -Werror and other diagnostic flags due to rocm-terminal sporting an old googletest APT package (Ubuntu 18.04). 
# Here we're only testing the consumption logic, and we want to avoid new errors breaking logic testing script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release "$(if [ -n "$VCPKG_DIR" ]; then echo "-DCMAKE_TOOLCHAIN_FILE=$VCPKG_DIR/scripts/buildsystems/vcpkg.cmake"; fi)" -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D BUILD_BENCHMARK=ON -D GPU_TARGETS=$GPU_TARGETS -D AMDGPU_TEST_TARGETS=$GPU_TARGETS -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CXX_STANDARD=14 -S $CI_PROJECT_DIR -B $BUILD_DIR - cmake --build $BUILD_DIR --target test_basic build:cmake-minimum-vcpkg: stage: build needs: [] extends: - .cmake-minimum-vcpkg - .build:vcpkg-apt build:cmake-minimum-apt: stage: build needs: [] extends: - .cmake-minimum-apt - .build:vcpkg-apt .cmake-latest: extends: - .deps:rocm - .deps:cmake-latest - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-latest", before_script] - !reference [".deps:compiler-acceleration", before_script] .cmake-minimum: extends: - .deps:rocm - .deps:cmake-minimum - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:compiler-acceleration", before_script] .build:common: stage: build tags: - build extends: - .gpus:rocm-gpus - .rules:build variables: EXTRA_CMAKE_CXX_FLAGS: "" script: - mkdir -p $BUILD_DIR - cd $BUILD_DIR - | # Add hardened libc++ assertions for tests only if [[ $BUILD_TARGET == "TEST" ]]; then echo "Configuring with hardened libc++!" 
EXTRA_CMAKE_CXX_FLAGS+=" -D_GLIBCXX_ASSERTIONS=ON" fi - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror $EXTRA_CMAKE_CXX_FLAGS" -D CMAKE_BUILD_TYPE="$BUILD_TYPE" -D BUILD_$BUILD_TARGET=ON -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D BUILD_EXAMPLE=ON -D GPU_TARGETS=$GPU_TARGETS -D AMDGPU_TEST_TARGETS=$GPU_TARGETS -D CMAKE_CXX_STANDARD="$BUILD_VERSION" -S $CI_PROJECT_DIR -B $BUILD_DIR - cmake --build $BUILD_DIR artifacts: paths: - $BUILD_DIR/.ninja_log - $BUILD_DIR/benchmark/* - $BUILD_DIR/CMakeCache.txt - $BUILD_DIR/CTestTestfile.cmake - $BUILD_DIR/deps/googlebenchmark/ - $BUILD_DIR/gtest/ - $BUILD_DIR/test/CTestTestfile.cmake - $BUILD_DIR/test/rocprim/CTestTestfile.cmake - $BUILD_DIR/test/rocprim/test_* - $BUILD_DIR/test/test_* expire_in: 2 weeks build:cmake-latest: stage: build needs: [] extends: - .cmake-latest - .build:common parallel: # Debug builds disabled due to excessive build times for debug test builds matrix: - BUILD_TYPE: Release BUILD_TARGET: [BENCHMARK, TEST] BUILD_VERSION: [14, 17] build:cmake-minimum: needs: [] extends: - .cmake-minimum - .build:common parallel: matrix: - BUILD_TYPE: [Debug, Release] BUILD_TARGET: [BENCHMARK, TEST] BUILD_VERSION: 14 build:package: stage: build needs: [] tags: - build extends: - .cmake-minimum - .gpus:rocm-gpus - .rules:build script: - mkdir -p $PACKAGE_DIR - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D CMAKE_CXX_STANDARD=14 -B $PACKAGE_DIR -S $CI_PROJECT_DIR - cd $PACKAGE_DIR - cpack -G "DEB;ZIP" artifacts: paths: - $PACKAGE_DIR/rocprim*.deb - $PACKAGE_DIR/rocprim*.zip expire_in: 2 weeks build:windows: stage: build needs: [] extends: - .rules:build - .gpus:rocm-windows - .deps:rocm-windows - .deps:visual-studio-devshell parallel: matrix: - BUILD_TYPE: [Debug, Release] BUILD_TARGET: [BENCHMARK, TEST] script: - mkdir -p $CI_PROJECT_DIR/build - cmake -G Ninja -S $CI_PROJECT_DIR -B 
$CI_PROJECT_DIR/build -D BUILD_$BUILD_TARGET=ON -D GPU_TARGETS=$GPU_TARGET -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}" -D CMAKE_BUILD_TYPE="$BUILD_TYPE" -D CMAKE_CXX_STANDARD=14 - cmake --build "$CI_PROJECT_DIR/build" artifacts: paths: - $CI_PROJECT_DIR/build/test/test_* - $CI_PROJECT_DIR/build/test/rocprim/test_* - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake - $CI_PROJECT_DIR/build/test/rocprim/CTestTestfile.cmake - $CI_PROJECT_DIR/build/gtest/ - $CI_PROJECT_DIR/build/CMakeCache.txt - $CI_PROJECT_DIR/build/.ninja_log - $CI_PROJECT_DIR/build/CTestTestfile.cmake expire_in: 2 weeks autotune:build: stage: autotune needs: [] tags: - build extends: - .cmake-minimum - .gpus:rocm-gpus - .rules:benchmark before_script: - !reference [".cmake-minimum", before_script] - $SUDO_CMD apt-get update -qq - $SUDO_CMD apt-get install -qq -y zstd variables: BENCHMARK_TARGETS: benchmark_config_tuning script: - mkdir -p $BUILD_DIR - cd $BUILD_DIR - 'printf "Building benchmark targets: %s\n" "$BENCHMARK_TARGETS"' - cmake -B $BUILD_DIR -S $CI_PROJECT_DIR -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wno-#pragma-messages" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=OFF -D BUILD_EXAMPLE=OFF -D BUILD_BENCHMARK=ON -D BENCHMARK_CONFIG_TUNING=ON -D GPU_TARGETS=$GPU_TARGETS -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CXX_STANDARD=14 - cmake --build . --target $BENCHMARK_TARGETS - 'rm -rf $BUILD_DIR/benchmark/benchmark*.parallel' # The autotune benchmarks get very large, above GitLabs upload limit. Fortunately they compress well. # We'll put them all in a single archive to compress them to a few hundred MB. 
- find benchmark -type f -executable -print0 | tar -I zstd -cvf benchmarks.tar.zstd --null -T - artifacts: paths: - $BUILD_DIR/benchmarks.tar.zstd - $BUILD_DIR/.ninja_log - $BUILD_DIR/deps/googlebenchmark/ expire_in: 1 week test: stage: test extends: - .cmake-minimum - .rules:test - .gpus:rocm needs: - job: build:cmake-minimum parallel: matrix: - BUILD_TYPE: Release BUILD_TARGET: TEST BUILD_VERSION: 14 script: - cd $BUILD_DIR - cmake -D CMAKE_PREFIX_PATH=/opt/rocm -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake - cat ./resources.json # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. # This causes the hipMemcpy to fail, which is not reported as an error by HIP. # As a temporary workaround, disable the SDMA for test stability. - HSA_ENABLE_SDMA=0 ctest --output-on-failure --repeat-until-fail 2 --tests-regex "hip|$GPU_TARGET" --resource-spec-file ./resources.json --parallel $PARALLEL_JOBS .test-windows-base: stage: test extends: - .deps:rocm-windows - .gpus:rocm-gpus-windows - .deps:visual-studio-devshell - .rules:test script: - cd $CI_PROJECT_DIR/build - ctest --output-on-failure # Disabled due to extensive link times. 
# This is tracked in issue 679 # test-windows-debug: # extends: # - .test-windows-base # needs: # - job: build:windows # parallel: # matrix: # - BUILD_TYPE: Debug # BUILD_TARGET: TEST test-windows-release: extends: - .test-windows-base needs: - job: build:windows parallel: matrix: - BUILD_TYPE: Release BUILD_TARGET: TEST .test-package: script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D GPU_TARGETS=$GPU_TARGETS -D CMAKE_CXX_STANDARD=14 -S "$CI_PROJECT_DIR/test/extra" -B "$CI_PROJECT_DIR/package_test" - cmake --build "$CI_PROJECT_DIR/package_test" - "$CI_PROJECT_DIR/package_test/test_rocprim_package" - cd "$CI_PROJECT_DIR/package_test" - ctest --output-on-failure --repeat-until-fail 2 test:install: stage: test needs: [] tags: - rocm extends: - .cmake-minimum - .rules:test - .gpus:rocm-gpus script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D CMAKE_CXX_STANDARD=14 -B build -S $CI_PROJECT_DIR - $SUDO_CMD cmake --build build --target install - !reference [.test-package, script] test:deb: stage: test needs: - build:package tags: - rocm extends: - .cmake-minimum - .rules:test - .gpus:rocm-gpus script: - $SUDO_CMD dpkg -i $PACKAGE_DIR/rocprim*.deb - !reference [.test-package, script] test:docs: stage: test variables: SPHINX_DIR: $DOCS_DIR/sphinx extends: - .rules:test - .build:docs .benchmark-base: stage: benchmark extends: - .rules:benchmark variables: BENCHMARK_RESULT_DIR: ${CI_PROJECT_DIR}/benchmark_results BENCHMARK_RESULT_CACHE_DIR: ${BENCHMARK_RESULT_DIR}_cache benchmark: needs: - job: build:cmake-minimum parallel: matrix: - BUILD_TYPE: Release BUILD_TARGET: BENCHMARK BUILD_VERSION: 14 extends: - .cmake-minimum - .gpus:rocm - .benchmark-base variables: BENCHMARK_FILENAME_REGEX: ^benchmark BENCHMARK_ALGORITHM_REGEX: "" BENCHMARK_SEED: random script: - 'printf "CI Variables used in benchmarks:\nBENCHMARK_RESULT_DIR: %s\nBENCHMARK_FILENAME_REGEX: %s\nBENCHMARK_ALGORITHM_REGEX: %s \n" 
"$BENCHMARK_RESULT_DIR" "$BENCHMARK_FILENAME_REGEX" "$BENCHMARK_ALGORITHM_REGEX"' - cd "${CI_PROJECT_DIR}" - mkdir -p "${BENCHMARK_RESULT_DIR}" - python3 .gitlab/run_benchmarks.py --benchmark_dir "${BUILD_DIR}/benchmark" --benchmark_gpu_architecture "${GPU_TARGET}" --benchmark_output_dir "${BENCHMARK_RESULT_DIR}" --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}" --benchmark_filter_regex "${BENCHMARK_ALGORITHM_REGEX}" --seed "${BENCHMARK_SEED}" artifacts: paths: - ${BENCHMARK_RESULT_DIR} expire_in: 1 week benchmark:cache-or-report: needs: - benchmark extends: - .benchmark-base tags: - single-cache cache: key: benchmark-cache paths: - ${BENCHMARK_RESULT_CACHE_DIR} script: # If on MR branch, generate report, else cache results - > if [ ! -z "${CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" ]; then if [ ! -d "${BENCHMARK_RESULT_CACHE_DIR}" ]; then echo 'ERROR: Cache directory does not exist' exit 1 elif [ ! -d "${BENCHMARK_RESULT_DIR}" ]; then echo 'ERROR: Benchmark results directory does not exist' exit 1 else echo 'INFO: Files in cache (reference benchmarks):' ls -al ${BENCHMARK_RESULT_CACHE_DIR} echo 'INFO: Generating report...' python3 .gitlab/generate_report.py --old ${BENCHMARK_RESULT_CACHE_DIR} --new ${BENCHMARK_RESULT_DIR} fi elif [ "${CI_COMMIT_BRANCH}" == "${CI_DEFAULT_BRANCH}" ]; then echo 'INFO: Caching benchmark results...' 
mkdir -p ${BENCHMARK_RESULT_CACHE_DIR} cp -R ${BENCHMARK_RESULT_DIR}/*.json ${BENCHMARK_RESULT_CACHE_DIR} else echo 'ERROR: Neither on a merge-request branch or the default branch' exit 1 fi .autotune-base: stage: autotune extends: - .rules:manual variables: AUTOTUNE_RESULT_DIR: ${CI_PROJECT_DIR}/autotune_results autotune:execute-tuning: needs: - autotune:build extends: - .autotune-base - .cmake-minimum - .gpus:rocm variables: AUTOTUNE_FILENAME_REGEX: ^benchmark AUTOTUNE_ALGORITHM_REGEX: "" AUTOTUNE_SIZE: "" AUTOTUNE_TRIALS: "" timeout: 8h artifacts: paths: - ${AUTOTUNE_RESULT_DIR}/*.json before_script: - !reference [".cmake-minimum", before_script] - $SUDO_CMD apt-get update -qq - $SUDO_CMD apt-get install -qq -y zstd script: - cd "${CI_PROJECT_DIR}" - tar -I zstd -xvf "${BUILD_DIR}/benchmarks.tar.zstd" -C "${BUILD_DIR}/" - | if [ ! -d "${BUILD_DIR}/benchmark" ]; then echo "There are no benchmark executables. Run the build job with a BUILD_TARGET." exit 1 fi - mkdir -p "${AUTOTUNE_RESULT_DIR}" - python3 .gitlab/run_benchmarks.py --benchmark_dir="${BUILD_DIR}/benchmark" --benchmark_gpu_architecture="${GPU_TARGET}" --benchmark_output_dir="${AUTOTUNE_RESULT_DIR}" --benchmark_filename_regex="${AUTOTUNE_FILENAME_REGEX}" --benchmark_filter_regex="${AUTOTUNE_ALGORITHM_REGEX}" --size="${AUTOTUNE_SIZE}" --trials="${AUTOTUNE_TRIALS}" --seed=82589933 autotune:generate-config: image: python:3.10.5-buster needs: - job: "autotune:execute-tuning" optional: true extends: - .rules:manual - .autotune-base variables: AUTOTUNE_CONFIG_REPO_PATH: /rocprim/include/rocprim/device/detail/config AUTOTUNE_RESULT_CACHE_DIR: ${AUTOTUNE_RESULT_DIR}_cache tags: - single-cache cache: key: autotune-cache paths: - autotune_results_cache/ script: # Set cache dir variables depending on if this is a MR or not - > if [ ! 
-z "${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}" ]; then AUTOTUNE_RESULT_CACHE_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}" else AUTOTUNE_RESULT_CACHE_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_COMMIT_BRANCH}" fi # If the global cache dir does not exist, create it - mkdir -p $AUTOTUNE_RESULT_CACHE_DIR # If there are fresh results in the artifacts, cache them in the branch cache # If there are no fresh results, check branch cache # If there are no branch cache results, check TARGET branch cache # If there are TARGET branch cache results, cache them in the branch cache - > if [ -d "$AUTOTUNE_RESULT_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_CACHE_BRANCH_DIR cp -R -u ${AUTOTUNE_RESULT_DIR}/*.json ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR} elif [ -d "$AUTOTUNE_RESULT_CACHE_BRANCH_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_DIR cp -R -u ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR}/*.json ${AUTOTUNE_RESULT_DIR} elif [ -d "$AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_DIR cp -R -u ${AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR}/*.json ${AUTOTUNE_RESULT_DIR} mkdir -p $AUTOTUNE_RESULT_CACHE_BRANCH_DIR cp -R -u ${AUTOTUNE_RESULT_DIR}/*.json ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR} else echo 'ERROR: No autotune results found in previous artifacts, the branch cache or the target branch cache...' 
exit 1 fi # List the final .json files to use for config generation - ls -al ${AUTOTUNE_RESULT_DIR} - cd "${CI_PROJECT_DIR}" - python3 -m pip install jinja2 - mkdir -p ${AUTOTUNE_CONFIG_DIR}${AUTOTUNE_CONFIG_REPO_PATH} - python3 scripts/autotune/create_optimization.py --benchmark_files ${AUTOTUNE_RESULT_DIR}/*.json --out_basedir "${AUTOTUNE_CONFIG_DIR}${AUTOTUNE_CONFIG_REPO_PATH}" artifacts: paths: - ${AUTOTUNE_CONFIG_DIR} scheduled-check-changes: stage: autotune extends: .rules:scheduled-check-changes rocPRIM-rocm-6.4.3/.gitlab/000077500000000000000000000000001502235215600152565ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.gitlab/generate_report.py000066400000000000000000000122301502235215600210130ustar00rootroot00000000000000#!/usr/bin/env python3 # Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
import json import argparse import os import re import stat import sys class bcolors: OKGREEN = '\033[92m' WARNING = '\033[93m' FAIL = '\033[91m' ENDC = '\033[0m' def load_benchmarks(benchmark_dir): def is_benchmark_json(filename): if not re.match(r'.*\.json$', filename): return False path = os.path.join(benchmark_dir, filename) st_mode = os.stat(path).st_mode # we are not interested in permissions, just whether it is a regular file (S_IFREG) return (st_mode & stat.S_IFREG) def add_results(results, file_path: str): """ Adds a single file to the results. The file contains the results of benchmarks executed on a single architecture. The benchmarks within the file may belong to different algorithms. """ with open(file_path, "r+") as file_handle: # Fix Google Benchmark comma issue contents = file_handle.read() contents = re.sub(r"(\s*\"[^\"]*\"[^,])(^\s*\"[^\"]*\":)", "\\1,\\2", contents, 0, re.MULTILINE) file_handle.seek(0) file_handle.write(contents) file_handle.truncate() with open(file_path) as file_handle: benchmark_run_data = json.load(file_handle) try: arch = benchmark_run_data['context']['hdp_gcn_arch_name'].split(":")[0] results.setdefault(arch, {}) for single_benchmark in benchmark_run_data['benchmarks']: name = single_benchmark['name'].replace('/manual_time','') name = re.sub(r"(^device.*?)(,\s[A-z_]*_config.*>)$", "\\1>", name, 0, re.MULTILINE) results[arch][name] = single_benchmark['bytes_per_second'] except KeyError as err: print(f'KeyError: {err}, while reading file: {file_path}', file=sys.stderr, flush=True) benchmark_names = [name for name in os.listdir(benchmark_dir) if is_benchmark_json(name)] print('The following benchmark results will be reported:\n{}'.format('\n'.join(benchmark_names))) # Results is: {arch : {algorithm : bytes_per_second}, ...} results = {} for benchmark_name in benchmark_names: path = os.path.join(benchmark_dir, benchmark_name) add_results(results, path) return results def compare_results(old, new): results = [] incomparable = 0 
for (arch, names) in new.items(): if arch in old: for (name, value_new) in names.items(): if name in old[arch]: results.append((f'{name} ({arch})', ((value_new - old[arch][name]) / old[arch][name]) * 100)) else: incomparable = incomparable + 1 if(incomparable > 0): print(f'Could not compare {incomparable} benchmarks.') print(f'----------------------------------------') success = True results.sort(key = lambda x: x[0]) for (name, difference) in results: if difference < -10: success = False print(f'{bcolors.FAIL}X {bcolors.ENDC} {name}: {bcolors.FAIL}{difference:.0f}{bcolors.ENDC}%') elif difference < -2: success = False print(f'{bcolors.WARNING}! {bcolors.ENDC} {name}: {bcolors.WARNING}{difference:.0f}{bcolors.ENDC}%') else: print(f'{bcolors.OKGREEN}OK{bcolors.ENDC} {name}: {bcolors.OKGREEN}{difference:.0f}{bcolors.ENDC}%') return success def main(): parser = argparse.ArgumentParser() parser.add_argument('--old', help='The local directory that contains the old benchmark json files', required=True) parser.add_argument('--new', help='The local directory that contains the new benchmark json files', required=True) args = parser.parse_args() old_benchmarks = load_benchmarks(args.old) new_benchmarks = load_benchmarks(args.new) return compare_results(old_benchmarks, new_benchmarks) if __name__ == '__main__': success = main() if success: exit(0) else: exit(1) rocPRIM-rocm-6.4.3/.gitlab/run_benchmarks.py000077500000000000000000000122301502235215600206320ustar00rootroot00000000000000#!/usr/bin/env python3 # Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
import argparse from collections import namedtuple import os import re import stat import subprocess import sys BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials', 'seed']) def run_benchmarks(benchmark_context): def is_benchmark_executable(filename): if not re.match(benchmark_context.benchmark_filename_regex, filename): return False path = os.path.join(benchmark_context.benchmark_dir, filename) st_mode = os.stat(path).st_mode # we are not interested in permissions, just whether there is any execution flag set # and it is a regular file (S_IFREG) return (st_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) and (st_mode & stat.S_IFREG) success = True benchmark_names = [name for name in os.listdir(benchmark_context.benchmark_dir) if is_benchmark_executable(name)] print('The following benchmarks will be ran:\n{}'.format('\n'.join(benchmark_names)), file=sys.stderr, flush=True) for benchmark_name in benchmark_names: results_json_name = f'{benchmark_name}_{benchmark_context.gpu_architecture}.json' benchmark_path = os.path.join(benchmark_context.benchmark_dir, benchmark_name) results_json_path = os.path.join(benchmark_context.benchmark_output_dir, results_json_name) args = [ benchmark_path, '--name_format', 'json', '--benchmark_out_format=json', f'--benchmark_out={results_json_path}', f'--benchmark_filter={benchmark_context.benchmark_filter_regex}' ] if benchmark_context.size: args += ['--size', benchmark_context.size] if benchmark_context.trials: args += ['--trials', benchmark_context.trials] if benchmark_context.seed: args += ['--seed', benchmark_context.seed] try: subprocess.check_call(args) except subprocess.CalledProcessError as error: print(f'Could not run benchmark at {benchmark_path}. 
Error: "{error}"', file=sys.stderr, flush=True) success = False return success def main(): parser = argparse.ArgumentParser() parser.add_argument('--benchmark_dir', help='The local directory that contains the benchmark executables', required=True) parser.add_argument('--benchmark_gpu_architecture', help='The architecture of the currently enabled GPU', required=True) parser.add_argument('--benchmark_output_dir', help='The directory to write the benchmarks to', required=True) parser.add_argument('--benchmark_filename_regex', help='Regular expression that controls the list of benchmark executables to run', default=r'^benchmark', required=False) parser.add_argument('--benchmark_filter_regex', help='Regular expression that controls the list of benchmarks to run in each benchmark executable', default='', required=False) parser.add_argument('--size', help='Controls the number of processed items in each benchmark', default='', required=False) parser.add_argument('--trials', help='Controls the number of trial iterations for each benchmark case', default='', required=False) parser.add_argument('--seed', help='Controls the seed for random number generation for each benchmark case', default='', required=False) args = parser.parse_args() benchmark_context = BenchmarkContext( args.benchmark_gpu_architecture, args.benchmark_output_dir, args.benchmark_dir, args.benchmark_filename_regex, args.benchmark_filter_regex, args.size, args.trials, args.seed) benchmark_run_successful = run_benchmarks(benchmark_context) return benchmark_run_successful if __name__ == '__main__': success = main() if success: exit(0) else: exit(1) rocPRIM-rocm-6.4.3/.jenkins/000077500000000000000000000000001502235215600154555ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/.jenkins/common.groovy000066400000000000000000000052761502235215600202260ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
def runCompileCommand(platform, project, jobName, boolean debug=false) { project.paths.construct_build_prefix() String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} --toolchain=toolchain-linux.cmake ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project) { String sudo = auxiliary.sudo(platform.jenkinsLabel) def testCommand = "ctest --output-on-failure " def testCommandExcludeRegex = '' def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\"" def hmmExcludeRegex = '' def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\"" def hmmTestCommand = '' if (platform.jenkinsLabel.contains('gfx90a')) { echo("HMM TESTS DISABLED") /*hmmTestCommand = """ export HSA_XNACK=1 export ROCPRIM_USE_HMM=1 ${testCommand} ${hmmTestCommandExclude} """*/ } echo(env.JOB_NAME) if (env.JOB_NAME.contains('bleeding-edge')) { testCommand = '' testCommandExclude = '' hmmTestCommand = '' echo("TESTS DISABLED") } def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} cd ${project.testDirectory} ${testCommand} ${testCommandExclude} if (( \$? 
!= 0 )); then exit 1 fi ${hmmTestCommand} """ platform.runCommand(this, command) } def runPackageCommand(platform, project) { def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) } return this rocPRIM-rocm-6.4.3/.jenkins/precheckin.groovy000066400000000000000000000043441502235215600210440ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocPRIM', 'PreCheckin') prj.paths.build_command = './install -c' prj.timeout.compile = 600 def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> 
if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName) } } } rocPRIM-rocm-6.4.3/.jenkins/static.groovy000066400000000000000000000044031502235215600202140ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocPRIM', 'static') prj.paths.build_command = './install -c -s' prj.timeout.compile = 600 prj.timeout.packaging = 120 def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } 
jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName) } } } rocPRIM-rocm-6.4.3/.readthedocs.yaml000066400000000000000000000005021502235215600171620ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "3.10" rocPRIM-rocm-6.4.3/CHANGELOG.md000066400000000000000000000646331502235215600155630ustar00rootroot00000000000000# Changelog for rocPRIM Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). ## rocPRIM 3.4.1 for ROCm 6.4.2 ### Upcoming changes * Changes to the template parameters of warp and block algorithms will be made in an upcoming release. ### Deprecations * Due to an upcoming compiler change the following warp size-related symbols will be removed in the next major release and are thus marked as deprecated: * `rocprim::device_warp_size()` * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory. * For run-time constants, this is replaced with `rocprim::arch::wavefront::size().` * `rocprim::warp_size()` * `ROCPRIM_WAVEFRONT_SIZE ## rocPRIM 3.4.0 for ROCm 6.4.0 ### Added * Added extended tests to `rtest.py`. These tests are extra tests that did not fit the criteria of smoke and regression tests. 
These tests will take much longer to run relative to smoke and regression tests. * Use `python rtest.py [--emulation|-e|--test|-t]=extended` to run these tests. * Added regression tests to `rtest.py`. Regression tests are a subset of tests that caused hardware problems for past emulation environments. * Can be run with `python rtest.py [--emulation|-e|--test|-t]=regression` * Added the parallel `find_first_of` device function with autotuned configurations, this function is similar to `std::find_first_of`, it searches for the first occurrence of any of the provided elements. * Added the `--emulation` option for `rtest.py` * Unit tests can be run with `[--emulation|-e|--test|-t]=` * Added tuned configurations for segmented radix sort for gfx942 to improve performance on this architecture. * Added a parallel device-level function, `rocprim::adjacent_find`, similar to the C++ Standard Library `std::adjacent_find` algorithm. * Added configuration autotuning to device adjacent find (`rocprim::adjacent_find`) for improved performance on selected architectures. * Added `rocprim::numeric_limits` which is an extension of `std::numeric_limits`, which includes support for 128-bit integers. * Added `rocprim::int128_t` and `rocprim::uint128_t` which are the `__int128_t` and `__uint128_t` types. * Added the parallel `search` and `find_end` device functions similar to `std::search` and `std::find_end`, these functions search for the first and last occurrence of the sequence respectively. * Added a parallel device-level function, `rocprim::search_n`, similar to the C++ Standard Library `std::search_n` algorithm. * Added new constructors and a `base` function, and added `constexpr` specifier to all functions in `rocprim::reverse_iterator` to improve parity with the C++17 `std::reverse_iterator`. * Added hipGraph support to device run-length-encode for non trivial runs (`rocprim::run_length_encode_non_trivial_runs`).
* Added configuration autotuning to device run-length-encode for non trivial runs (`rocprim::run_length_encode_non_trivial_runs`) for improved performance on selected architectures. * Added configuration autotuning to device run-length-encode for trivial runs (`rocprim::run_length_encode`) for improved performance on selected architectures. * Added a new type traits interface to enable users to provide additional type trait information to rocPRIM, facilitating better compatibility with custom types. ### Changed * Changed the subset of tests that are run for smoke tests such that the smoke test will complete with faster run-time and to never exceed 2GB of vram usage. Use `python rtest.py [--emulation|-e|--test|-t]=smoke` to run these tests. * The `rtest.py` options have changed. `rtest.py` is now run with at least either `--test|-t` or `--emulation|-e`, but not both options. * Changed the internal algorithm of block radix sort to use rank match to improve performance of various radix sort related algorithms. * Disabled padding in various cases where higher occupancy resulted in better performance despite more bank conflicts. * Removed HIP-CPU support. HIP-CPU support was experimental and broken. * Changed the C++ version from 14 to 17. C++14 will be deprecated in the next major release. * You can use CMake HIP language support with CMake 3.18 and later. To use HIP language support, run `cmake` with `-DUSE_HIPCXX=ON` instead of setting the `CXX` variable to the path to a HIP-aware compiler. ### Resolved issues * Fixed an issue where `rmake.py` would generate wrong CMAKE commands while using Linux environment * Fixed an issue where `rocprim::partial_sort_copy` would yield a compile error if the input iterator is const. * Fixed incorrect 128-bit signed and unsigned integers type traits. * Fixed compilation issue when `rocprim::radix_key_codec<...>` is specialized with a 128-bit integer. 
* Fixed the warp-level reduction `rocprim::warp_reduce.reduce` DPP implementation to avoid undefined intermediate values during the reduction. * Fixed an issue that caused a segmentation fault when `hipStreamLegacy` was passed to some API functions. ### Upcoming changes * Using the initialisation constructor of `rocprim::reverse_iterator` will throw a deprecation warning. It will be marked as explicit in the next major release. ## rocPRIM 3.3.0 for ROCm 6.3.0 ### Added * Changed the default value of `rmake.py -a` to `default_gpus`. This is equivalent to `gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201`. * The `--test smoke` option has been added to `rtest.py`. When `rtest.py` is called with this option it runs a subset of tests such that the total test time is 5 minutes. Use `python3 ./rtest.py --test smoke` or `python3 ./rtest.py -t smoke` to run the smoke test. * The `--seed` option has been added to `run_benchmarks.py`. The `--seed` option specifies a seed for the generation of random inputs. When the option is omitted, the default behavior is to use a random seed for each benchmark measurement. * Added configuration autotuning to device partition (`rocprim::partition`, `rocprim::partition_two_way`, and `rocprim::partition_three_way`), to device select (`rocprim::select`, `rocprim::unique`, and `rocprim::unique_by_key`), and to device reduce by key (`rocprim::reduce_by_key`) to improve performance on selected architectures. * Added `rocprim::uninitialized_array` to provide uninitialized storage in local memory for user-defined types. * Added large segment support for `rocprim::segmented_reduce`. * Added a parallel `nth_element` device function similar to `std::nth_element`.
`nth_element` places elements that are smaller than the nth element before the nth element, and elements that are bigger than the nth element after the nth element. * Added deterministic (bitwise reproducible) algorithm variants `rocprim::deterministic_inclusive_scan`, `rocprim::deterministic_exclusive_scan`, `rocprim::deterministic_inclusive_scan_by_key`, `rocprim::deterministic_exclusive_scan_by_key`, and `rocprim::deterministic_reduce_by_key`. These provide run-to-run stable results with non-associative operators such as float operations, at the cost of reduced performance. * Added a parallel `partial_sort` and `partial_sort_copy` device functions similar to `std::partial_sort` and `std::partial_sort_copy`. `partial_sort` and `partial_sort_copy` arrange elements such that the elements are in the same order as a sorted list up to and including the middle index. ### Changed * Modified the input size in device adjacent difference benchmarks. Observed performance with these benchmarks might be different. * Changed the default seed for `device_benchmark_segmented_reduce`. * Changed `test_utils_hipgraphs.hpp` to be a class `GraphHelper` with internal graph and graph instances ### Removed * `rocprim::thread_load()` and `rocprim::thread_store()` have been deprecated. Use `dereference()` instead. ### Resolved issues * Fixed an issue in `rmake.py` where the list storing cmake options would contain individual characters instead of a full string of options. * Resolved an issue in `rtest.py` where it crashed if the `build` folder was created without `release` or `debug` subdirectories. * Resolved an issue with `rtest.py` on Windows where passing an absolute path to `--install_dir` caused a `FileNotFound` error. * rocPRIM functions are no longer forcefully inlined on Windows. This significantly reduces the build time of debug builds. 
* `block_load`, `block_store`, `block_shuffle`, `block_exchange`, and `warp_exchange` now use placement `new` instead of copy assignment (`operator=`) when writing to local memory. This fixes the behavior of custom types with non-trivial copy assignments. * Fixed a bug in the generation of input data for benchmarks, which caused incorrect performance to be reported in specific cases. It may affect the reported performance for one-byte types (`uint8_t` and `int8_t`) and instantiations of `custom_type`. Specifically, device binary search, device histogram, device merge and warp sort are affected. * Fixed a bug for `rocprim::merge_path_search` where using `unsigned` offsets would produce incorrect results. * Fixed a bug for `rocprim::thread_load` and `rocprim::thread_store` where `float` and `double` were not cast to the correct type, resulting in incorrect results. * Resolved an issue where tests were failing when they were compiled with `-D_GLIBCXX_ASSERTIONS=ON`. * Resolved an issue where algorithms that used an internal serial merge routine caused a memory access fault that resulted in potential performance drops when using block sort, device merge sort (block merge), device merge, device partial sort, and device sort (merge sort). * Fixed memory leaks in unit tests due to missing calls to `hipFree()` and the incorrect use of hipGraphs. * Fixed an issue where certain inputs to `block_sort_merge()`, `device_merge_sort_merge_path()`, `device_merge()`, and `warp_sort_stable()` caused an assertion error during the call to `serial_merge()`. ## rocPRIM 3.2.1 for ROCm 6.2.1 ### Optimizations * Improved performance of `block_reduce_warp_reduce` when warp size equals block size. ## rocPRIM-3.2.0 for ROCm 6.2.0 ### Additions * New overloads for `warp_scan::exclusive_scan` that take no initial value. These new overloads will write an unspecified result to the first value of each warp.
* The internal accumulator type of `inclusive_scan(_by_key)` and `exclusive_scan(_by_key)` is now exposed as an optional type parameter. * The default accumulator type is still the value type of the input iterator (inclusive scan) or the initial value's type (exclusive scan). This is the same behaviour as before this change. * New overload for `device_adjacent_difference_inplace` that allows separate input and output iterators, but allows them to point to the same element. * New public API for deriving resulting type on device-only functions: * `rocprim::invoke_result` * `rocprim::invoke_result_t` * `rocprim::invoke_result_binary_op` * `rocprim::invoke_result_binary_op_t` * New `rocprim::batch_copy` function added. Similar to `rocprim::batch_memcpy`, but copies by element, not with memcpy. * Added more test cases, to better cover supported data types. * Updated some tests to work with supported data types. * An optional `decomposer` argument for all member functions of `rocprim::block_radix_sort` and all functions of `device_radix_sort`. To sort keys of an user-defined type, a decomposer functor should be passed. The decomposer should produce a `rocprim::tuple` of references to arithmetic types from the key. * New `rocprim::predicate_iterator` which acts as a proxy for an underlying iterator based on a predicate. It iterates over proxies that holds the references to the underlying values, but only allow reading and writing if the predicate is `true`. It can be instantiated with: * `rocprim::make_predicate_iterator` * `rocprim::make_mask_iterator` * Added custom radix sizes as the last parameter for `block_radix_sort`. The default value is 4, it can be a number between 0 and 32. * New `rocprim::radix_key_codec`, which allows the encoding/decoding of keys for radix-based sorts. For user-defined key types, a decomposer functor should be passed. ### Optimizations * Improved the performance of `warp_sort_shuffle` and `block_sort_bitonic`. 
* Created an optimized version of the `warp_exchange` functions `blocked_to_striped_shuffle` and `striped_to_blocked_shuffle` when the warpsize is equal to the items per thread. * Improved the performance of `device_transform`. ### Fixes * Fixed incorrect results of `warp_exchange::blocked_to_striped_shuffle` and `warp_exchange::striped_to_blocked_shuffle` when the block size is larger than the logical warp size. The test suite has been updated with such cases. * Fixed incorrect results returned when calling device `unique_by_key` with overlapping `values_input` and `values_output`. * Fixed incorrect output type used in `device_adjacent_difference`. * Hotfix for incorrect results on the GFX10 (Navi 10/RDNA1, Navi 20/RDNA2) ISA and GFX11 ISA (Navi 30 GPUs) on device scan algorithms `rocprim::inclusive_scan(_by_key)` and `rocprim::exclusive_scan(_by_key)` with large input types. * `device_adjacent_difference` now considers both the input and the output type for selecting the appropriate kernel launch config. Previously only the input type was considered, which could result in compilation errors due to excessive shared memory usage. * Fixed incorrect data being loaded with `rocprim::thread_load` when compiling with `-O0`. * Fixed a compilation failure in the host compiler when instantiating various block and device algorithms with block sizes not divisible by 64. ### Deprecations * The internal header `detail/match_result_type.hpp` has been deprecated. * `TwiddleIn` and `TwiddleOut` have been deprecated in favor of `radix_key_codec`. * The internal `::rocprim::detail::radix_key_codec` has been deprecated in favor of the new public utility with the same name. 
## rocPRIM-3.1.0 for ROCm 6.1.0 ### Additions * New primitive: `block_run_length_decode` * New primitive: `batch_memcpy` ### Changes * Renamed: * `scan_config_v2` to `scan_config` * `scan_by_key_config_v2` to `scan_by_key_config` * `radix_sort_config_v2` to `radix_sort_config` * `reduce_by_key_config_v2` to `reduce_by_key_config` * Removed support for custom config types for device algorithms * `host_warp_size()` was moved into `rocprim/device/config_types.hpp`; it now uses either `device_id` or a `stream` parameter to query the proper device and a `device_id` out parameter * The return type is `hipError_t` * Added support for `__int128_t` in `device_radix_sort` and `block_radix_sort` * Improved the performance of `match_any`, and `block_histogram` which uses it ### Deprecations * Removed `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config`, and `radix_sort_config` ### Fixes * Build issues with `rmake.py` on Windows when using VS 2017 15.8 or later (due to a breaking fix with extended aligned storage) * Fix tests for `block_histogram`, `block_exchange`, `device_histogram` and `device_reduce_by_key` for various types ### Known Issues * `device_run_length_encode`, `warp_exchange` and `warp_load` tests fail with `rocprim::half` ## rocPRIM-3.0.0 for ROCm 6.0.0 ### Additions - `block_sort::sort()` overload for keys and values with a dynamic size, for all block sort algorithms. Additionally, all `block_sort::sort()` overloads with a dynamic size are now supported for `block_sort_algorithm::merge_sort` and `block_sort_algorithm::bitonic_sort`. - New two-way partition primitive `partition_two_way` which can write to two separate iterators. ### Optimizations - Improved the performance of `partition`. ### Fixes - Fixed `rocprim::MatchAny` for devices with 64-bit warp size. The function `rocprim::MatchAny` is deprecated and `rocprim::match_any` is preferred instead.
## rocPRIM-2.13.1 for ROCm 5.7.0 ### Changes - Deprecated configuration `radix_sort_config` for device-level radix sort as it no longer matches the algorithm's parameters. New configuration `radix_sort_config_v2` is preferred instead. - Removed erroneous implementation of device-level `inclusive_scan` and `exclusive_scan`. The prior default implementation using lookback-scan now is the only available implementation. - The benchmark metric indicating the bytes processed for `exclusive_scan_by_key` and `inclusive_scan_by_key` has been changed to incorporate the key type. Furthermore, the benchmark log has been changed such that these algorithms are reported as `scan` and `scan_by_key` instead of `scan_exclusive` and `scan_inclusive`. - Deprecated configurations `scan_config` and `scan_by_key_config` for device-level scans, as they no longer match the algorithm's parameters. New configurations `scan_config_v2` and `scan_by_key_config_v2` are preferred instead. ### Fixes - Fixed build issue caused by missing header in `thread/thread_search.hpp`. 
## rocPRIM-2.13.0 for ROCm 5.5.0 ### Additions * New block level `radix_rank` primitive * New block level `radix_rank_match` primitive * Added a stable block sorting implementation, which can be used with `block_sort` by adding the `block_sort_algorithm::stable_merge_sort` algorithm ### Changes * Improved the performance of: * `block_radix_sort` * `device_radix_sort` * `device_merge_sort` * Updated the `docs` directory structure to match the standard of [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) ### Known Issues * Disabled GPU error messages relating to incorrect warp operation usage with Navi GPUs on Windows (due to GPU `printf` performance issues on Windows) * When `ROCPRIM_DISABLE_LOOKBACK_SCAN` is set, `device_scan` fails for input sizes larger than `scan_config::size_limit`, which defaults to `std::numeric_limits::max()` ## rocPRIM-2.12.0 for ROCm 5.4.0 ### Changes * `device_partition`, `device_unique`, and `device_reduce_by_key` now support problem sizes larger than 2^32 items * Device algorithms now return `hipErrorInvalidValue` if the amount of passed temporary memory is insufficient * Lists of sizes for tests are unified, restored scan and reduce tests for `half` and `bfloat16` values ### Removals * `block_sort::sort()` overload for keys and values with a dynamic size * This overload was documented but the implementation is missing; to avoid further confusion, the documentation is removed until a decision is made on implementing the function ## rocPRIM-2.11.1 for ROCm 5.3.3 ### Fixes * Fixed the compilation failure in `device_merge` when the two key iterators don't match ## rocPRIM-2.11.0 for ROCm 5.3.2 ### Known Issues * `device_merge` doesn't correctly support different types for `keys_input1` and `keys_input2` (as of the 5.3.0 release) ## rocPRIM-2.11.0 for ROCm 5.3.0 ### Additions * New functions `subtract_left` and `subtract_right` in `block_adjacent_difference` to apply functions on pairs of adjacent items distributed 
between threads in a block * New device-level `adjacent_difference` primitives * Experimental tooling for automatic kernel configuration tuning for various architectures * Benchmarks collect and output more detailed system information * CMake functionality improves build parallelism of the test suite that splits compilation units by function or by parameters * Reverse iterator * Support for problem sizes over `UINT_MAX` in device functions `inclusive_scan_by_key` and `exclusive_scan_by_key` ### Changes * Improved the performance of warp primitives using the swizzle operation on Navi * Improved build parallelism of the test suite by splitting up large compilation units * `device_select` now supports problem sizes larger than 2^32 items * `device_segmented_radix_sort` now partitions segments to group small, medium, and large segments * Each segment group can be sorted by specialized kernels to improve throughput * Improved histogram performance for the case of highly uneven sample distribution ## rocPRIM-2.10.14 for ROCm 5.2.0 ### Additions * Packages for tests and benchmark executables on all supported operating systems using CPack * Added file and folder reorganization changes with backward compatibility support using wrapper headers ## rocPRIM-2.10.13 for ROCm 5.1.0 ### Fixes * Fixed Radix Sort `int64_t` bug introduced in version 2.10.11 ### Additions * Future value * Device `partition_three_way` to partition input to three output iterators based on two predicates ### Changes * The reduce/scan algorithm precision issues in the tests have been resolved for half types * The device Radix Sort algorithm supports indexing with 64-bit unsigned integers * The indexer type is chosen based on the type argument of parameter `size` * If `sizeof(size)` is not larger than 4 bytes, the indexer type is 32-bit unsigned int, otherwise, the indexer type is 64-bit unsigned int * The maximum problem size is based on the compile time configuration of the algorithm according to the
following formula: * `max_problem_size = (UINT_MAX + 1) * config::scan::block_size * config::scan::items_per_thread` ### Deprecations * Flags API of `block_adjacent_difference` ### Known issues * `device_segmented_radix_sort` unit test is failing for HIP on Windows ## rocPRIM-2.10.12 for ROCm 5.0.0 ### Fixes * Enable bfloat16 tests and reduce threshold for bfloat16 * Fix device scan `limit_size` feature * Non-optimized builds no longer trigger local memory limit errors ### Additions * Scan size limit feature * Reduce size limit feature * Transform size limit feature * `block_load_striped` and `block_store_striped` * `gather_to_blocked` to gather values from other threads into a blocked arrangement * The block sizes for device merge sorts initial block sort and its merge steps are now separate in its kernel config * The block sort step supports multiple items per thread ### Changes * you can now set the `size_limit` for scan, reduce, and transform in the config struct instead of using a parameter * `device_scan` and `device_segmented_scan`: `inclusive_scan` now uses the `input-type` as `accumulator-type`; `exclusive_scan` uses `initial-value-type` * This changes the behavior of small-size input types with large-size output types (e.g., `short` input, `int` output) and low-res input with high-res output (e.g., `float` input, `double` output) * Revert an old Fiji workaround because they solved the issue at the compiler side * Update README CMake minimum version number * Added block sort support for multiple items per thread * Currently only powers of two block sizes, and items per threads are supported and only for full blocks * Bumped the minimum required version of CMake to 3.16 ### Known issues * `device_segmented_radix_sort` and `device_scan` unit tests failing for HIP on Windows * `ReduceEmptyInput` causes random failure with bfloat16 ## rocPRIM-2.10.11 for ROCm 4.5.0 ### Additions * Initial HIP on Windows support * bfloat16 support added ### Changes * Packaging 
has been split into a runtime package (`rocprim`) and a development package (`rocprim-devel`): The development package depends on the runtime package. When installing the runtime package, the package manager will suggest the installation of the development package to aid users transitioning from the previous version's combined package. This suggestion by package manager is for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion` feature in the runtime package is introduced as a deprecated feature and will be removed in a future ROCm release. * Because rocPRIM is a header-only library, the runtime package is an empty placeholder used to aid in the transition. This package is also a deprecated feature and will be removed in a future rocm release. ### Known issues * Unit tests may soft hang on MI200 when running in `hipMallocManaged` mode ## rocPRIM-2.10.11 for ROCm 4.4.0 ### Additions * Code coverage tools build option * AddressSanitizer build option * gfx1030 support added * Experimental [HIP-CPU](https://github.com/ROCm-Developer-Tools/HIP-CPU) support; build using GCC/Clang/MSVC on Windows and Linux (this is work in progress and many algorithms are known to fail) ### Optimizations * Added single tile Radix Sort for smaller sizes * Improved performance for Radix Sort for larger element sizes ## rocPRIM-2.10.10 for ROCm 4.3.0 ### Fixes * Bug fix and minor performance improvement for `merge_sort` when input and output storage are the same ### Additions * gfx90a support added ### Deprecations * `warp_size()` function; use `host_warp_size()` and `device_warp_size()` for host and device references, respectively ## rocPRIM-2.10.9 for ROCm 4.2.0 ### Fixes * Size zero inputs are now properly handled with newer ROCm builds that no longer allow zero-size kernel grid and block dimensions ### Changes * Minimum CMake version required is now 3.10.2 ### Known issues * Device scan unit test is currently failing due to an LLVM bug ## 
rocPRIM-2.10.8 for ROCm 4.1.0 ### Fixes * Texture cache iteration support has been re-enabled * Benchmark builds have been re-enabled * Unique operator is no longer called on invalid elements ### Known issues * Device scan unit test is currently failing because of an LLVM bug ## rocPRIM-2.10.7 for ROCm 4.0.0 * No new features ## rocPRIM-2.10.6 for ROCm 3.10 ### Optimizations * Updates to DPP instructions for warp shuffle ### Known issues * Benchmark builds are disabled due to compiler bug ## rocPRIM-2.10.5 for ROCm 3.9.0 ### Additions * HIP CMake dependency ### Optimizations * Updates to warp shuffle for gfx10 * Disabled DPP functions on gfx10++ ### Known issues * Benchmark builds are disabled due to compiler bug ## rocPRIM-2.10.4 for ROCm 3.8.0 ### Fixes * Fix for rocPRIM texture cache iterator ## rocPRIM-2.10.3 for ROCm 3.7.0 ### Fixes * Package dependency correct to `hip-rocclr` ### Known issues * rocPRIM texture cache iterator functionality is broken in the runtime (this will be fixed in the next release); you can use the prior release if calling this function ## rocPRIM-2.10.2 for ROCm 3.6.0 * No new features ## rocPRIM-2.10.1 for ROCm 3.5.1 ### Fixes * Point release with compilation fix ## rocPRIM-2.10.1 for ROCm 3.5.0 ### Additions * Improved tests with fixed and random seeds for test data * Network interface improvements with API v3 ### Changes * Switched to HIP-Clang as the default compiler * CMake searches for rocPRIM locally first; if t's not found, CMake downloads it from GitHub rocPRIM-rocm-6.4.3/CMakeLists.txt000066400000000000000000000224501502235215600165010ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_policy(VERSION 3.16...3.25)

# Install prefix
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories")

# rocPRIM project
project(rocprim LANGUAGES CXX)

# Set CXX flags
if (NOT DEFINED CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Set HIP flags
set(CMAKE_HIP_STANDARD 14)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set(CMAKE_HIP_EXTENSIONS OFF)

# Set CXX standard
if (CMAKE_CXX_STANDARD EQUAL 14)
  message(WARNING "C++14 will be deprecated in the next major release")
elseif(NOT CMAKE_CXX_STANDARD EQUAL 17)
  message(FATAL_ERROR "Only C++14 and C++17 are supported")
endif()

# Record whether rocPRIM is the top-level project or is embedded in another
# build; packaging components below are only set up for top-level builds.
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
  set(ROCPRIM_PROJECT_IS_TOP_LEVEL TRUE)
else()
  set(ROCPRIM_PROJECT_IS_TOP_LEVEL FALSE)
endif()

#Adding CMAKE_PREFIX_PATH
if(WIN32)
  set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation")
else()
  set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
endif()

include(CheckLanguage)
include(CMakeDependentOption)

# Build options
# Disables building tests, benchmarks, examples
option(ONLY_INSTALL "Only install" OFF)
cmake_dependent_option(BUILD_TEST "Build tests (requires googletest)" OFF "NOT ONLY_INSTALL" OFF)
cmake_dependent_option(BUILD_BENCHMARK "Build benchmarks" OFF "NOT ONLY_INSTALL" OFF)
cmake_dependent_option(BUILD_EXAMPLE "Build examples" OFF "NOT ONLY_INSTALL" OFF)
option(BUILD_NAIVE_BENCHMARK "Build naive benchmarks" OFF)
cmake_dependent_option(BUILD_DOCS "Build documentation (requires sphinx)" OFF "NOT ONLY_INSTALL" OFF)
option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF)
option(ROCPRIM_INSTALL "Enable installation of rocPRIM (projects embedding rocPRIM may want to turn this OFF)" ON)

check_language(HIP)
cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF)

# NOTE: ROCPRIM_PROJECT_IS_TOP_LEVEL and ROCM_ROOT are already set above; the
# verbatim second copy of that stanza that used to live here was a no-op
# (same comparison, and set(... CACHE ...) does not overwrite an existing
# cache entry) and has been dropped.

# CMake modules
list(APPEND CMAKE_MODULE_PATH
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake
  ${ROCM_PATH}/lib/cmake/hip
  ${HIP_PATH}/cmake
  ${ROCM_ROOT}/lib/cmake/hip
  ${ROCM_ROOT}/hip/cmake # FindHIP.cmake
)

# Set a default build type if none was specified
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath")

# Remember the caller's BUILD_SHARED_LIBS choice (defaulting to ON) so it can
# be restored for packaging after the client dependencies have been configured.
if(DEFINED BUILD_SHARED_LIBS)
  set(PKG_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
else()
  set(PKG_BUILD_SHARED_LIBS ON)
endif()
set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared

# Get dependencies (required here to get rocm-cmake)
include(cmake/Dependencies.cmake)

# Use target ID syntax if supported for GPU_TARGETS
if(USE_HIPCXX)
  enable_language(HIP)
else()
  # AMDGPU_TARGETS, when defined, seeds the GPU_TARGETS cache entry.
  if (NOT DEFINED AMDGPU_TARGETS)
    set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for")
  else()
    set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for")
  endif()
  set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all")

  # "all" is expanded to the default architecture list checked by rocm-cmake.
  if(GPU_TARGETS STREQUAL "all")
    if(BUILD_ADDRESS_SANITIZER)
      # ASAN builds require xnack
      rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
        TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+"
      )
    else()
      rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
        TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
      )
    endif()
    set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE)
  endif()
endif()

# TODO: Fix VerifyCompiler for HIP on Windows
if (NOT WIN32)
  include(cmake/VerifyCompiler.cmake)
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm ${ROCM_ROOT}/llvm ${ROCM_ROOT} ${ROCM_ROOT}/hip)
find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm)

# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG
option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
if(ROCPRIM_INSTALL AND BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
  rocm_wrap_header_dir(
    "${PROJECT_SOURCE_DIR}/rocprim/include/rocprim"
    WRAPPER_LOCATIONS rocprim/include/rocprim
    OUTPUT_LOCATIONS rocprim/wrapper/include/rocprim
    PATTERNS *.hpp
  )
endif()

if(BUILD_CODE_COVERAGE)
  add_compile_options(-fprofile-arcs -ftest-coverage)
  add_link_options(--coverage)
endif()

# Setup VERSION
set(VERSION_STRING "3.4.1")
rocm_setup_version(VERSION ${VERSION_STRING})

# Print configuration summary
include(cmake/Summary.cmake)
print_configuration_summary()

# rocPRIM library
add_subdirectory(rocprim)

if(ROCPRIM_PROJECT_IS_TOP_LEVEL AND (BUILD_TEST OR BUILD_BENCHMARK))
  rocm_package_setup_component(clients)
endif()

# Tests
if(BUILD_TEST)
  if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
    rocm_package_setup_client_component(tests)
  endif()
  enable_testing()
  add_subdirectory(test)
endif()

# Benchmarks
if(BUILD_BENCHMARK)
  if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
    rocm_package_setup_client_component(benchmarks)
  endif()
  add_subdirectory(benchmark)
endif()

# Examples
if(BUILD_EXAMPLE)
  add_subdirectory(example)
endif()

# Docs
if(BUILD_DOCS)
  add_subdirectory(docs)
endif()

# set BUILD_SHARED_LIBS for packaging
set(BUILD_SHARED_LIBS ${PKG_BUILD_SHARED_LIBS})

# Package
if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
  # add dependency on HIP runtime
  set(HIP_RUNTIME_MINIMUM 4.5.0)
  if(BUILD_ADDRESS_SANITIZER)
    set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
  else()
    set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
  endif()
  rocm_package_add_dependencies(SHARED_DEPENDS "${DEPENDS_HIP_RUNTIME} >= ${HIP_RUNTIME_MINIMUM}")
  rocm_package_add_deb_dependencies(STATIC_DEPENDS "hip-static-dev >= ${HIP_RUNTIME_MINIMUM}")
  rocm_package_add_rpm_dependencies(STATIC_DEPENDS "hip-static-devel >= ${HIP_RUNTIME_MINIMUM}")

  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt")
  set(CPACK_RPM_PACKAGE_LICENSE "MIT")
  set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" )

  rocm_create_package(
    NAME rocprim
    DESCRIPTION "Radeon Open Compute Parallel Primitives Library"
    MAINTAINER "rocPRIM Maintainer "
    HEADER_ONLY
  )
endif()

#
# ADDITIONAL TARGETS FOR CODE COVERAGE
#
if(BUILD_CODE_COVERAGE)
  #
  # > make coverage_cleanup (clean coverage related files.)
  # > # run your tests
  # > make coverage (generate html documentation)
  #

  #
  # Prepare coverage output
  # This little script is generated because the option '--gcov-tool ' of lcov cannot take arguments.
  #
  add_custom_target(coverage
    DEPENDS rocprim
    COMMAND mkdir -p lcoverage
    COMMAND echo "\\#!/bin/bash" > llvm-gcov.sh
    COMMAND echo "\\# THIS FILE HAS BEEN GENERATED" >> llvm-gcov.sh
    COMMAND printf "exec /opt/rocm/llvm/bin/llvm-cov gcov $$\\@" >> llvm-gcov.sh
    COMMAND chmod +x llvm-gcov.sh
  )

  #
  # Generate coverage output.
  #
  add_custom_command(TARGET coverage
    COMMAND lcov --directory . --base-directory . --gcov-tool ${CMAKE_BINARY_DIR}/llvm-gcov.sh --capture -o lcoverage/raw_main_coverage.info
    COMMAND lcov --remove lcoverage/raw_main_coverage.info "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info
    COMMAND genhtml lcoverage/main_coverage.info --output-directory lcoverage
  )

  #
  # Coverage cleanup
  #
  # The pattern is quoted and the command is VERBATIM so that `find` receives
  # the literal glob; unquoted, the shell may expand *.gcda against files in
  # the working directory before find runs.
  add_custom_target(coverage_cleanup
    COMMAND find ${CMAKE_BINARY_DIR} -name "*.gcda" -delete
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
    VERBATIM
  )
endif()
rocPRIM-rocm-6.4.3/CONTRIBUTING.md000066400000000000000000000427671502235215600162070ustar00rootroot00000000000000 # Contributing to rocPRIM # We welcome contributions to rocPRIM. Please follow these details to help ensure your contributions will be successfully accepted. ## Issue Discussion ## Please use the GitHub Issues tab to notify us of issues. * Use your best judgement for issue creation. If your issue is already listed, upvote the issue and comment or post to provide additional details, such as how you reproduced this issue. * If you're not sure if your issue is the same, err on the side of caution and file your issue. You can add a comment to include the issue number (and link) for the similar issue. If we evaluate your issue as being the same as the existing issue, we'll close the duplicate. * If your issue doesn't exist, use the issue template to file a new issue. * When filing an issue, be sure to provide as much information as possible, including script output so we can collect information about your configuration. This helps reduce the time required to reproduce your issue. * Check your issue regularly, as we may require additional information to successfully reproduce the issue. * You may also open an issue to ask questions to the maintainers about whether a proposed change meets the acceptance criteria, or to discuss an idea pertaining to the library. ## Acceptance Criteria ## rocPRIM provides a number of foundational parallel algorithms that are optimized for AMD ROCm platforms.
The purpose of the library is to provide a reliable, performant foundation upon which other libraries can be built. The library is written in HIP, targeting AMD's ROCm platform. Correctness and performance are both important goals in rocPRIM. Because of this, new changes should include both **test** and **benchmark** coverage. Tests and benchmarks should be broad enough to ensure that code runs correctly and performs well across a variety of input types and sizes. More specifically: - Tests must cover all the functionality added to the public API. - Tests must cover the whole range of supported sizes, not by testing every single possible size but rather using representative sizes that ensure that the algorithms run successfully with any size from the range. - On this note, it also needs to be taken into account that some algorithms have support for large indices (indices that cannot be stored in a 32-bit integer), so input sizes should also cover that case. - Tests and benchmarks must be instantiated with all supported data types. - If the algorithm uses multiple data types (for instance, if it uses different types for input and output), a selected and representative few combinations should be tested instead of the full combination matrix. We also employ automated testing and benchmarking via checks that are run when a pull request is created. These checks: - test all algorithms for correctness across a variety of input configurations (e.g. types, sizes, etc.) - run benchmarks to check for performance degradation - test the change on various OS platforms (Ubuntu, RHEL, etc.) - build and run the code on different GPU architectures (MI-series, Radeon series cards, etc.) ## Code Structure ## rocPRIM is a header-only library. Library code is located inside of `rocprim/include/rocprim/`, and within the `rocprim` namespace. Note that all the symbols inside the `rocprim::detail` namespace are not part of the public API.
Algorithms are grouped by the level-of-scope at which they operate. The following subdirectories organize them by hardware-level scope: * `device/`: contains headers for device-level algorithms, which are to be called from host code. * `block/`: contains headers for block-level algorithms, only callable from device code. * `warp/`: contains headers for warp/wavefront-level algorithms, only callable from device code. * `thread/`: contains headers for thread-level algorithms, only callable from device code. Supporting code is distributed into several subdirectories depending on its scope: * `detail/`: utility functions and structs for the internal state of algorithms. * `detail/config/`: configs for tuned algorithms (see [tuning](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/concepts/tuning.html)). * `intrinsics/`: specialized intrinsic functions (eg. atomics, warp-shuffling, bit manipulation, etc.). Some of them are just wrappers around HIP's intrinsics, atomic/warp-shuffle functions or compiler's intrinsics. * `iterator/`: iterators that are used to interact with most algorithms in the library (like `constant_iterator` for iterating over a homogeneous range of values or `transform_iterator` for applying a transformation to a given range of values). * `types/`: a number of convenient types used in the library (eg. for storing future values, compile-time integer sequences, etc.). Correctness code (tests) is located inside the `test` folder. Several test suites exist depending on what they assess: * `extra`: test suite that should be run after rocPRIM is installed from package or from source. It is a short smoke test to verify the correctness of the installation or packaging process. * `hip`: test suite that checks HIP functionality that is of particular interest to rocPRIM. * `hipgraph`: test suite for verifying that rocPRIM's algorithms work with `hipGraph`. * `rocprim`: test suite for checking the correctness of rocPRIM's algorithms. 
Finally, performance code (benchmarks) is located inside the `benchmark` folder. Tuned algorithms use three files: * `benchmark/benchmark_.cpp` * `benchmark/benchmark_.parallel.cpp.in` * `benchmark/benchmark_.parallel.hpp` while non-tuned algorithms have only one `benchmark/benchmark_.cpp` file. ## Coding Style ## C and C++ code should be formatted using `clang-format`. Use the clang-format version shipped with ROCm, which is available in the `/opt/rocm` directory. Please do not use your system's built-in `clang-format`, as this is an older version that will have different results. The check_format script (`scripts/code-format/check-format.sh`) lets you check for formatting violations. These can be easily fixed as described below. To format a file, use: ```bash /opt/rocm/llvm/bin/clang-format -style=file -i ``` To format all modified (staged) files, use the following command inside the root directory of rocPRIM: ```bash /opt/rocm/llvm/bin/git-clang-format --style=file --binary /opt/rocm/llvm/bin/clang-format ``` Format modifications will stay unstaged, so that they can be reviewed before committing. The formatting can also be done on a per-commit basis, by running: ```bash /opt/rocm/llvm/bin/git-clang-format --style=file --binary /opt/rocm/llvm/bin/clang-format ``` or installing githooks: ```bash ./.githooks/install ``` The githooks installed will both format the code and update the copyright dates (see [deliverables](#deliverables)). Additionally, some code editors (such as Visual Studio Code, CLion, XCode, Eclipse, Vim, etc.) have clang-format plugins available, so that formatting can be done from the editor instead of from command-line. This is especially useful for formatting while coding. ### Namespaces ### As mentioned in [Code Structure](#code-structure), rocPRIM's symbols are exposed within the `rocprim` namespace, with the exception of the ones intended for internal use which are inside `rocprim::detail`.
This is done so that users can place rocPRIM in a different namespace (keeping `rocprim` as the innermost namespace) to prevent a namespace collision when two independent rocPRIM libraries end up in the same compute unit through, for instance, indirect inclusion. Therefore, files from `rocprim/include/rocprim` containing any rocPRIM symbol should start with `BEGIN_ROCPRIM_NAMESPACE` and end with `END_ROCPRIM_NAMESPACE`. These are macros that wrap the namespace opening and closing, respectively. Implementation details are put into the `rocprim::detail` namespace. No wrapping macros are defined for this one, so just the usual ```c++ namespace detail { ... } ``` should be used when needed. ## Documenting Style ## Apart from the usual comments to ease understanding of the code, Sphinx and Doxygen are used to document the functionality available from rocPRIM. The Sphinx docs for the API are organized mostly following the code structure. The folders `_ops` (block_ops, device_ops, etc.) contain the documentation files for methods operating in the corresponding hardware levels. The documentation for supporting code is placed in separate files, located inside `docs/reference`. To connect Sphinx with Doxygen, Breathe is used. There is a Doxygen group defined for each folder under `rocprim/include/rocprim/` which has documented functionality named as `module` (for example, `threadmodule` for members of `rocprim/include/rocprim/thread` or `intrinsicsmodule` for members of `rocprim/include/rocprim/intrinsics`). Placing the contents of a file inside the corresponding Doxygen group guarantees that Sphinx will get access to the documentation inside that file. Only members of the public API need to be documented with these two tools, as in the ones outside the `rocprim::detail` namespace, as all symbols inside said namespace are excluded from the documentation.
If some member does not need documentation (such as a specialization of a class that doesn't need any extra clarifications) it can be left out of Doxygen docs by encapsulating the code as shown below: ```c++ /// \cond // code without doxygen documentation here /// \endcond ``` This isn't always possible (for instance, when base classes need to be excluded), so a pre-processor approach is also available: ```c++ #ifndef DOXYGEN_DOCUMENTATION_BUILD // code without doxygen documentation here #endif // DOXYGEN_DOCUMENTATION_BUILD ``` Some files also use the following structure: ```c++ #ifndef DOXYGEN_SHOULD_SKIP_THIS // code without doxygen documentation here #endif // DOXYGEN_SHOULD_SKIP_THIS ``` New code should prefer `DOXYGEN_DOCUMENTATION_BUILD` over `DOXYGEN_SHOULD_SKIP_THIS`, as it's easier to understand. `DOXYGEN_SHOULD_SKIP_THIS` is defined to be 1 when Doxygen is parsing, logically making its correct usage a double-negation. In general terms, a file properly documented should look like something along the lines of: ```c++ /// \addtogroup /// @{ BEGIN_ROCPRIM_NAMESPACE namespace detail { // here just add comments if needed ... } // end namespace detail /// \brief Some public class. /// /// Here some more info can be added to the brief description. /// \tparam A Template type used by the class. /// \tparam ... template class some_class { /// \brief A type used within the class. using class_type = some_other_type; /// \brief A method member of the class. /// /// \tparam B Another template parameter. /// \param [in] param_in_first Input parameter description. /// \param [in] param_in_second [optional] Optional input parameter description. /// \param [out] param_out Output parameter description. /// \param [in,out] param_in_out Input/Output parameter description. /// \return Returned object description. template return_type some_class_method(A param_in_first, B param_in_second = {}) { ... } } ...
END_ROCPRIM_NAMESPACE /// @} // end of group ``` ## Pull Request Guidelines ## Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). We also mostly abide by [GitHub's best practices for pull-requests](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/getting-started/best-practices-for-pull-requests), namely: 1. **Write small PRs**. PRs should be feature-focused and respect the scope of the issue(s) that it refers to. This makes reviews easier and faster, and yields fewer chances of overlooking bugs in the new/modified code. 2. **Review your own PR**. Before opening/undrafting your PR, take your time to review all the changes as if you were one of the reviewers. This helps catch typos or small errors in advance. 3. **Provide context and guidance**. PRs should generally have a descriptive title and an explanatory body that includes: - **scope** (purpose) of the PR: explanation of the scope of the PR (for instance, what feature/bug the PR adds/fixes). This helps identify new issues to be spawned from the comments received in the PR: if any comment suggests any addition/fix that falls out of this scope, a new issue should be created so that the comment is tackled in another (feature-focused) PR. - some **notes** explaining the changes/additions made so that reviewers know which decisions were taken and why. Here you can also explicitly request feedback on specific matters that you think may need to be discussed. - if necessary, **how to verify** that the issue(s) at hand are indeed tackled with this PR (something like "the newly added test covering the fixed bug's case is passing"). When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch, which serves as our integration branch. Releases are cut to `release/rocm-rel-x.y`, where x and y refer to the release major and minor numbers.
### Deliverables ### #### Correctness, performance and documentation #### Code that introduces new features should have **test coverage** and **benchmark coverage**. **Documentation** must also be added following the guidelines described in [Documenting Style](#documenting-style). If modifying existing functionality, tests, benchmarks and documentation must be updated to fit the new behavior and/or parameters. If the autotuning is run, benchmarks should be re-run to check whether performance indeed improves. If so, the new configuration files generated should be added to the corresponding PR. #### License #### rocPRIM is an open source library. Because of this, we include the **license description** shown below at the top of every source file. If you create new source files in the repository, please include this text in them as well (replacing "xx" with the digits for the current year): ```c++ // Copyright (c) 20xx Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. ``` If you modify existing files licensed in a previous year, add a dash followed by the modification year to indicate that the license also covers the most recent changes (like so: `Copyright (c) 20xx-20yy`). It may also happen that such an interval is already specified in the license, but the last year indicated is earlier than the current modification year; in this case, just update it accordingly. Under the `scripts/copyright-date` folder there is a `check-copyright` script that we use to check whether the copyright dates have been updated. It can also be used to automate those updates. Run ```bash scripts/copyright-date/check-copyright.sh -u ``` inside rocPRIM's root directory to update the copyright statements of modified files or set ```bash git config --local hooks.updateCopyright true ``` to automatically update copyrights when committing. #### Changes Record #### All noticeable changes are recorded in the `CHANGELOG.md` file. For every release, we annotate the additions, fixes, changes, deprecations and/or optimizations introduced within that release. When opening a PR, make sure to add all the meaningful changes introduced to the corresponding sections under the latest unreleased release. ### Process ### After you create a PR, you can take a look at a diff of the changes you made using the PR's "Files" tab. PRs must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails.
During code review, one or more other developers will look through your proposed changes. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. When a modification request has been completed, the conversation thread about it will be marked as resolved. To update the code in your PR (e.g., in response to a code review discussion), you can simply push another commit to the branch used in your pull request. rocPRIM-rocm-6.4.3/LICENSE.txt000066400000000000000000000021261502235215600155620ustar00rootroot00000000000000MIT License Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.rocPRIM-rocm-6.4.3/NOTICES.txt000066400000000000000000000066161502235215600156140ustar00rootroot00000000000000Notices and Licenses file ______________________________________________________________________________ AMD copyrighted code (MIT) Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ROCmSoftwarePlatform-rocPRIM v2.5.0 (MIT) Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. florianrappl-cmdparser v-u (MIT) Copyright (c) 2015 - 2016 Florian Rappl Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. rocPRIM-rocm-6.4.3/README.md000066400000000000000000000256121502235215600152230ustar00rootroot00000000000000# rocPRIM > [!NOTE] > The published documentation is available at [rocPRIM](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). rocPRIM is a header-only library that provides HIP parallel primitives. You can use this library to develop performant GPU-accelerated code on AMD ROCm platforms. ## Requirements * Git * CMake (3.16 or later) * AMD [ROCm](https://rocm.docs.amd.com/en/latest/) platform (1.8.2 or later) * Including [HIP-clang](https://github.com/ROCm/HIP/blob/master/INSTALL.md#hip-clang) compiler * C++17 * Python 3.6 or higher (HIP on Windows only, required only for install script) * Visual Studio 2019 with Clang support (HIP on Windows only) * Strawberry Perl (HIP on Windows only) Optional: * [GoogleTest](https://github.com/google/googletest) * Required only for tests. Building tests is on by default. * This is automatically downloaded and built by the CMake script. * [Google Benchmark](https://github.com/google/benchmark) * Required only for benchmarks. Building benchmarks is off by default. * This is automatically downloaded and built by the CMake script. 
## Documentation Documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). ### Build documentation locally ```shell # Change directory to rocPRIM docs cd rocPRIM; cd docs # Install documentation dependencies python3 -m pip install -r sphinx/requirements.txt # Build the documentation python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html # To serve the HTML docs locally cd _build/html python3 -m http.server ``` ### Build documentation via CMake Install [rocm-cmake](https://github.com/ROCm/rocm-cmake/) ```shell # Change directory to rocPRIM cd rocPRIM # Install documentation dependencies python3 -m pip install -r docs/sphinx/requirements.txt # Set C++ compiler # This example uses hipcc and assumes it is at the path /usr/bin export CXX=hipcc export PATH=/usr/bin:$PATH # Configure the project cmake -S . -B ./build -D BUILD_DOCS=ON # Build the documentation cmake --build ./build --target doc # To serve the HTML docs locally cd ./build/docs/html python3 -m http.server ``` ## Build and install You can build and install rocPRIM on Linux or Windows. * Linux: ```shell git clone https://github.com/ROCm/rocPRIM.git # Go to rocPRIM directory, create and go to the build directory. cd rocPRIM; mkdir build; cd build # Configure rocPRIM, setup options for your system. # Build options: # ONLY_INSTALL - OFF by default, If this flag is on, the build ignore the BUILD_* flags # BUILD_TEST - OFF by default, # BUILD_EXAMPLE - OFF by default, # BUILD_BENCHMARK - OFF by default. # BENCHMARK_CONFIG_TUNING - OFF by default. The purpose of this flag to find the best kernel config parameters. # At ON the compilation time can be increased significantly. # AMDGPU_TARGETS - list of AMD architectures, default: gfx803;gfx900;gfx906;gfx908. 
# You can make compilation faster if you want to test/benchmark only on one architecture, # for example, add -DAMDGPU_TARGETS=gfx906 to 'cmake' parameters. # AMDGPU_TEST_TARGETS - list of AMD architectures, default: "" (default system device) # If you want to detect failures on a per GFX IP basis, setting it to some set of ips will create # separate tests with the ip name embedded into the test name. Building for all, but selecting # tests only of a specific architecture is possible, e.g.: ctest -R gfx803|gfx900 # # ! IMPORTANT ! # Set C++ compiler to HIP-clang. You can do it by adding 'CXX=<path-to-compiler>' # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler. # Using HIP-clang: [CXX=hipcc] cmake -DBUILD_BENCHMARK=ON ../. # Build make -j4 # Optionally, run tests if they're enabled. ctest --output-on-failure # Install [sudo] make install ``` * Windows: We've added initial support for HIP on Windows; to install, use the provided `rmake.py` python script: ```shell git clone https://github.com/ROCm/rocPRIM.git cd rocPRIM # the -i option will install rocPRIM to C:\hipSDK by default python rmake.py -i # the -c option will build all clients including unit tests python rmake.py -c ``` ### Using rocPRIM Include the `<rocprim/rocprim.hpp>` header: ```cpp #include <rocprim/rocprim.hpp> ``` We recommend including rocPRIM into a CMake project by using the package configuration files. The rocPRIM package name is `rocprim`. ```cmake # "/opt/rocm" - default install prefix find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim") ...
# Includes only rocPRIM headers, HIP libraries have # to be linked manually by user target_link_libraries(<your_target> roc::rocprim) # Include rocPRIM headers and required HIP dependencies # - If using HIP language support (USE_HIPCXX=ON): target_link_libraries(<your_target> hip::host) # - Otherwise: target_link_libraries(<your_target> hip::device) ``` For more information on `hip::host` and `hip::device`, please see the [ROCm documentation](https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html#consuming-the-hip-api-in-c-code). ## Running unit tests Unit tests are implemented in terms of GoogleTest. Collections of tests are wrapped and invoked from CTest. ```shell # Go to rocPRIM build directory cd rocPRIM; cd build # List available tests ctest --show-only # To run all tests ctest # Run specific test(s) ctest -R <regex> # To run the Google Test manually ./test/rocprim/test_<unit-test-name> ``` ### Using multiple GPUs concurrently for testing This feature requires using CMake 3.16+ for building and testing. ```note Prior versions of CMake can't assign IDs to tests when running in parallel. Assigning tests to distinct devices could only be done at the cost of extreme complexity. ``` Unit tests can make use of the [CTest resource allocation](https://cmake.org/cmake/help/latest/manual/ctest.1.html#resource-allocation) feature, which you can use to distribute tests across multiple GPUs in an intelligent manner. This feature can accelerate testing when multiple GPUs of the same family are in a system. It can also test multiple product families from one invocation without having to use the `HIP_VISIBLE_DEVICES` environment variable. The feature relies on the presence of a resource specifications file. ```important Trying to use `RESOURCE_GROUPS` and `--resource-spec-file` with CMake and CTest for versions prior to 3.16 silently omits the feature. No warnings are issued about unknown properties or command-line arguments. Make sure that the `cmake` and `ctest` versions you invoke are sufficiently recent.
``` #### Auto resource specification generation You can independently call the utility script located in the repository using the following code: ```shell # Go to rocPRIM build directory cd rocPRIM; cd build # Invoke directly or use CMake script mode via cmake -P ../cmake/GenerateResourceSpec.cmake # Assuming you have 2 compatible GPUs in the system ctest --resource-spec-file ./resources.json --parallel 2 ``` #### Manual Assuming you have two GPUs from the gfx900 family and they are the first devices enumerated by the system, you can use `-D AMDGPU_TEST_TARGETS=gfx900` during configuration to specify that only one family will be tested. Leaving this variable empty (default) results in targeting the default device in the system. To let CMake know there are two GPUs that should be targeted, you have to provide a `JSON` file to CTest via the `--resource-spec-file <path_to_file>` flag. For example: ```json { "version": { "major": 1, "minor": 0 }, "local": [ { "gfx900": [ { "id": "0" }, { "id": "1" } ] } ] } ``` Invoking CTest as `ctest --resource-spec-file <path_to_file> --parallel 2` allows two tests to run concurrently, distributed between the two GPUs. ### Using custom seeds for the tests Modify the `rocPRIM/test/rocprim/test_seed.hpp` file. ```cpp //(1) static constexpr int random_seeds_count = 10; //(2) static constexpr unsigned int seeds [] = {0, 2, 10, 1000}; //(3) static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]); ``` (1) Defines a constant that sets how many passes over the tests will be done with runtime-generated seeds. Modify at will. (2) Defines the user-generated seeds. Each of the array elements will be used as a seed for all tests. Modify at will. If you don't want any static seeds, leave the array empty. ```cpp static constexpr unsigned int seeds [] = {}; ``` (3) Never modify this line.
## Running benchmarks ```shell # Go to rocPRIM build directory cd rocPRIM; cd build # To run benchmark for warp functions: # Further options can be found using --help # [] Fields are optional ./benchmark/benchmark_warp_<function_name> [--size <size>] [--trials <trials>] # To run benchmark for block functions: # Further options can be found using --help # [] Fields are optional ./benchmark/benchmark_block_<function_name> [--size <size>] [--trials <trials>] # To run benchmark for device functions: # Further options can be found using --help # [] Fields are optional ./benchmark/benchmark_device_<function_name> [--size <size>] [--trials <trials>] ``` ### Performance configuration Most device-specific primitives provided by rocPRIM can be tuned for other AMD devices, and different types and operations, by passing compile-time configuration structures as a template parameter. The main "knobs" are usually the size of the block and the number of items processed by a single thread. rocPRIM has built-in default configurations for each of its primitives. In order to use the included configurations, you need to define the macro `ROCPRIM_TARGET_ARCH` as `803` if you want the algorithms optimized for gfx803 GCN version, or as `900` for gfx900. ## hipCUB [hipCUB](https://github.com/ROCm/hipCUB/) is a thin wrapper library on top of [rocPRIM](https://github.com/ROCm/rocPRIM) or [CUB](https://github.com/NVlabs/cub). You can use it to port projects that use the CUB library to the [HIP](https://github.com/ROCm/HIP) layer and run them on AMD hardware. In the [ROCm](https://rocm.docs.amd.com/en/latest/) environment, hipCUB uses the rocPRIM library as a backend; on CUDA platforms, it uses CUB as a backend. ## Support You can report bugs and feature requests through our GitHub [issue tracker](https://github.com/ROCm/rocPRIM/issues). ## Contributions and license Contributions of any kind are most welcome! Contribution instructions are in [CONTRIBUTING](./CONTRIBUTING.md). Licensing information is in [LICENSE](./LICENSE.txt).
rocPRIM-rocm-6.4.3/benchmark/000077500000000000000000000000001502235215600156705ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/benchmark/CMakeLists.txt000066400000000000000000000173471502235215600204440ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
option(BENCHMARK_CONFIG_TUNING "Benchmark device-level functions using various configs" OFF)
include(../cmake/ConfigAutotune.cmake)
include(ConfigAutotuneSettings.cmake)

# Optional user overrides for the tuning matrix of a benchmark. Both hold
# string/list values, so they are declared as CACHE STRING entries: option()
# only models ON/OFF switches and would register them as BOOL. The empty
# default is false in if(), exactly like the previous OFF default.
set(BENCHMARK_TUNE_PARAM_NAMES "" CACHE STRING "Tuning parameter names")
set(BENCHMARK_TUNE_PARAMS "" CACHE STRING "Tuning parameters")

if(BENCHMARK_CONFIG_TUNING)
  # Umbrella target that every config-tuning benchmark is attached to.
  add_custom_target("benchmark_config_tuning")
endif()

# add_rocprim_benchmark(<source>)
#
# Registers one benchmark executable built from BENCHMARK_SOURCE. The target is
# named after the source file without its directory and extension.
#
# With BENCHMARK_CONFIG_TUNING enabled, a target is only created when a
# "<target>.parallel.cpp.in" template exists next to this file AND a non-empty
# parameter-name list is available, either from read_config_autotune_settings()
# or from the BENCHMARK_TUNE_PARAM_NAMES / BENCHMARK_TUNE_PARAMS overrides.
# Otherwise the function returns without defining anything.
function(add_rocprim_benchmark BENCHMARK_SOURCE)
  get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE)

  if(USE_HIPCXX)
    set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE HIP)
  endif()

  if(BENCHMARK_CONFIG_TUNING)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${BENCHMARK_TARGET}.parallel.cpp.in")
      message(STATUS "found ${BENCHMARK_TARGET}.parallel.cpp.in file, compiling in parallel.")
      read_config_autotune_settings(${BENCHMARK_TARGET} list_across_names list_across output_pattern_suffix)

      # User-supplied names/values replace the built-in ones, but only when
      # both are given. output_pattern_suffix is not overridden here.
      if(BENCHMARK_TUNE_PARAM_NAMES AND BENCHMARK_TUNE_PARAMS)
        set(list_across_names "${BENCHMARK_TUNE_PARAM_NAMES}")
        set(list_across "${BENCHMARK_TUNE_PARAMS}")
      endif()

      # Make sure that the variables are not empty, i.e. there actually is an
      # entry for that benchmark in benchmark/ConfigAutotuneSettings.cmake.
      if(list_across_names)
        add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE})
        target_compile_definitions(${BENCHMARK_TARGET} PRIVATE BENCHMARK_CONFIG_TUNING)
        # NOTE(review): add_matrix is not defined in this file; presumably it
        # comes from ../cmake/ConfigAutotune.cmake included above - confirm.
        add_matrix(
          TARGET ${BENCHMARK_TARGET}
          SHARDS 1
          CURRENT_SHARD 0
          INPUT "${BENCHMARK_TARGET}.parallel.cpp.in"
          OUTPUT_PATTERN "${BENCHMARK_TARGET}_${output_pattern_suffix}"
          NAMES ${list_across_names}
          LISTS ${list_across})
        add_dependencies(benchmark_config_tuning ${BENCHMARK_TARGET})
      else()
        message(WARNING "No config-tuning entry in benchmark/ConfigAutotuneSettings.cmake for ${BENCHMARK_TARGET}!")
        return()
      endif()
    else()
      # Do nothing if BENCHMARK_CONFIG_TUNING is ON but no
      # ${BENCHMARK_TARGET}.parallel.cpp.in exists.
      return()
    endif()
  else()
    add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE})
  endif()

  if(BUILD_NAIVE_BENCHMARK)
    target_compile_definitions(${BENCHMARK_TARGET} PUBLIC BUILD_NAIVE_BENCHMARK)
  endif()

  target_link_libraries(${BENCHMARK_TARGET}
    PRIVATE
      rocprim
      benchmark::benchmark
  )

  if(USE_HIPCXX)
    # With first-class HIP language support, link hip::host when the target is
    # linked as HIP and hip::device otherwise.
    target_link_libraries(${BENCHMARK_TARGET}
      PRIVATE
        $<IF:$<LINK_LANGUAGE:HIP>,hip::host,hip::device>
    )
  else()
    target_link_libraries(${BENCHMARK_TARGET} PRIVATE hip::device)
  endif()

  # MSVC only: number of sections exceeded object file format limit,
  # compile with /bigobj.
  target_compile_options(${BENCHMARK_TARGET}
    PRIVATE
      $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
  )

  set_target_properties(${BENCHMARK_TARGET}
    PROPERTIES
      RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark"
  )

  if(ROCPRIM_INSTALL)
    rocm_install(TARGETS ${BENCHMARK_TARGET} COMPONENT benchmarks)
  endif()

  # On Windows, copy the runtime DLLs next to the benchmarks. DLLS_COPIED is
  # exported to the caller's scope so the copy commands are attached to the
  # first benchmark target only.
  if(WIN32 AND NOT DEFINED DLLS_COPIED)
    set(DLLS_COPIED "YES")
    set(DLLS_COPIED ${DLLS_COPIED} PARENT_SCOPE)
    # for now adding in all .dll as dependency chain is not cmake based on win32
    file(GLOB third_party_dlls
      LIST_DIRECTORIES ON
      CONFIGURE_DEPENDS
      ${HIP_DIR}/bin/*.dll
      ${CMAKE_SOURCE_DIR}/rtest.*
    )
    foreach(file_i ${third_party_dlls})
      add_custom_command(TARGET ${BENCHMARK_TARGET}
        POST_BUILD
        COMMAND ${CMAKE_COMMAND}
        ARGS -E copy_if_different ${file_i} ${PROJECT_BINARY_DIR}/benchmark
      )
    endforeach()
  endif()
endfunction()

# ****************************************************************************
# Benchmarks
# ****************************************************************************

add_rocprim_benchmark(benchmark_block_adjacent_difference.cpp)
add_rocprim_benchmark(benchmark_block_discontinuity.cpp)
add_rocprim_benchmark(benchmark_block_exchange.cpp)
add_rocprim_benchmark(benchmark_block_histogram.cpp)
add_rocprim_benchmark(benchmark_block_radix_sort.cpp)
add_rocprim_benchmark(benchmark_block_radix_rank.cpp)
add_rocprim_benchmark(benchmark_block_reduce.cpp)
add_rocprim_benchmark(benchmark_block_run_length_decode.cpp)
add_rocprim_benchmark(benchmark_block_scan.cpp)
add_rocprim_benchmark(benchmark_block_sort.cpp)
add_rocprim_benchmark(benchmark_config_dispatch.cpp)
add_rocprim_benchmark(benchmark_device_adjacent_difference.cpp)
add_rocprim_benchmark(benchmark_device_adjacent_find.cpp)
add_rocprim_benchmark(benchmark_device_batch_memcpy.cpp)
add_rocprim_benchmark(benchmark_device_binary_search.cpp)
add_rocprim_benchmark(benchmark_device_find_first_of.cpp)
add_rocprim_benchmark(benchmark_device_find_end.cpp)
add_rocprim_benchmark(benchmark_device_histogram.cpp)
add_rocprim_benchmark(benchmark_device_merge.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_merge.cpp)
add_rocprim_benchmark(benchmark_device_nth_element.cpp)
add_rocprim_benchmark(benchmark_device_partial_sort.cpp)
add_rocprim_benchmark(benchmark_device_partial_sort_copy.cpp)
add_rocprim_benchmark(benchmark_device_partition.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_onesweep.cpp)
add_rocprim_benchmark(benchmark_device_reduce_by_key.cpp)
add_rocprim_benchmark(benchmark_device_reduce_by_key_deterministic.cpp) add_rocprim_benchmark(benchmark_device_reduce.cpp) add_rocprim_benchmark(benchmark_device_run_length_encode.cpp) add_rocprim_benchmark(benchmark_device_run_length_encode_non_trivial_runs.cpp) add_rocprim_benchmark(benchmark_device_scan.cpp) add_rocprim_benchmark(benchmark_device_scan_deterministic.cpp) add_rocprim_benchmark(benchmark_device_scan_by_key.cpp) add_rocprim_benchmark(benchmark_device_search.cpp) add_rocprim_benchmark(benchmark_device_scan_by_key_deterministic.cpp) add_rocprim_benchmark(benchmark_device_search_n.cpp) add_rocprim_benchmark(benchmark_device_select.cpp) add_rocprim_benchmark(benchmark_device_segmented_radix_sort_keys.cpp) add_rocprim_benchmark(benchmark_device_segmented_radix_sort_pairs.cpp) add_rocprim_benchmark(benchmark_device_segmented_reduce.cpp) add_rocprim_benchmark(benchmark_device_transform.cpp) add_rocprim_benchmark(benchmark_predicate_iterator.cpp) add_rocprim_benchmark(benchmark_warp_exchange.cpp) add_rocprim_benchmark(benchmark_warp_reduce.cpp) add_rocprim_benchmark(benchmark_warp_scan.cpp) add_rocprim_benchmark(benchmark_warp_sort.cpp) add_rocprim_benchmark(benchmark_device_memory.cpp) rocPRIM-rocm-6.4.3/benchmark/ConfigAutotuneSettings.cmake000066400000000000000000000212311502235215600233440ustar00rootroot00000000000000# MIT License # # Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # All default fallback types as listed in scripts/autotune/fallback_config.json set(TUNING_TYPES "int64_t int short int8_t double float rocprim::half") # If config selection happens based on two types, the second type has limited fallbacks. The selection is based # on the size and it is ignored whether the type is floating-point or integral. The autotuning script uses the # benchmarks for the integral types as fallback, hence tuning for the floating-point types is not needed. 
set(LIMITED_TUNING_TYPES "int64_t int short int8_t")

# read_config_autotune_settings(<file> <names_var> <lists_var> <suffix_var>)
#
# Looks up the config-tuning matrix for the benchmark called <file> (target
# name, without extension) and writes three values into the caller's scope:
#   <names_var>  - ';'-separated tuning parameter names
#   <lists_var>  - ';'-separated value lists, one per name, each list being
#                  space-separated
#   <suffix_var> - output name pattern made of @Name@ placeholders
# The results are stored in the variables whose names are passed in, so callers
# are free to choose their own variable names. When <file> has no entry, the
# caller's variables are left untouched.
function(read_config_autotune_settings file list_across_names list_across output_pattern_suffix)
  if(file STREQUAL "benchmark_device_adjacent_difference")
    set(_names "DataType;Left;InPlace;BlockSize")
    set(_values "${TUNING_TYPES};true;false true;32 64 128 256 512 1024")
    set(_suffix "@DataType@_@Left@_@InPlace@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_adjacent_find")
    set(_names "InputType;BlockSize")
    set(_values "${TUNING_TYPES};64 128 256 512 1024")
    set(_suffix "@InputType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_histogram")
    set(_names "DataType;BlockSize")
    set(_values "${TUNING_TYPES};64 128 256")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_merge_sort_block_merge")
    set(_names "KeyType;ValueType;BlockSize;UseMergePath")
    set(_values "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type;128 256 512 1024;true")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@_@UseMergePath@")
  elseif(file STREQUAL "benchmark_device_merge_sort_block_sort")
    set(_names "KeyType;ValueType;BlockSize;BlockSortMethod")
    set(_values "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type;256 512 1024;rocprim::block_sort_algorithm::stable_merge_sort")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@_@BlockSortMethod@")
  elseif(file STREQUAL "benchmark_device_radix_sort_block_sort")
    set(_names "KeyType;ValueType;BlockSize")
    set(_values "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};64 128 256 512 1024")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_radix_sort_onesweep")
    set(_names "KeyType;ValueType;BlockSize;RadixBits")
    set(_values "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};128 256 512 1024;4 5 6 7 8")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@_@RadixBits@")
  elseif(file STREQUAL "benchmark_device_reduce")
    set(_names "DataType;BlockSize;ItemsPerThread")
    set(_values "${TUNING_TYPES};64 128 256;1 2 4 8 16")
    set(_suffix "@DataType@_@BlockSize@_@ItemsPerThread@")
  elseif(file STREQUAL "benchmark_device_scan")
    set(_names "DataType;Algo")
    set(_values "${TUNING_TYPES};using_warp_scan reduce_then_scan")
    set(_suffix "@DataType@_@Algo@")
  elseif(file STREQUAL "benchmark_device_scan_by_key")
    set(_names "KeyType;ValueType;Algo")
    set(_values "${TUNING_TYPES};${LIMITED_TUNING_TYPES};using_warp_scan reduce_then_scan")
    set(_suffix "@KeyType@_@ValueType@_@Algo@")
  elseif(file STREQUAL "benchmark_device_binary_search")
    set(_names "SubAlgorithm;ValueType;OutputType;BlockSize;ItemsPerThread")
    set(_values "binary_search upper_bound lower_bound;${TUNING_TYPES};${LIMITED_TUNING_TYPES};64 128 256;1 2 4 8 16")
    set(_suffix "@SubAlgorithm@_@ValueType@_@OutputType@_@BlockSize@_@ItemsPerThread@")
  elseif(file STREQUAL "benchmark_device_segmented_radix_sort_keys")
    set(_names "KeyType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS")
    set(_values "${TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256")
    set(_suffix "@KeyType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@")
  elseif(file STREQUAL "benchmark_device_segmented_radix_sort_pairs")
    set(_names "KeyType;ValueType;LongBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS")
    set(_values "${TUNING_TYPES};int8_t;8;256;4 8 16;8;4;256;64;16;8;256")
    set(_suffix "@KeyType@_@ValueType@_@LongBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@")
  elseif(file STREQUAL "benchmark_device_transform")
    # No trailing ';' here: it would only add an empty list element.
    set(_names "DataType;BlockSize")
    set(_values "${TUNING_TYPES};64 128 256 512 1024")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_partition")
    set(_names "DataType;BlockSize")
    set(_values "${TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_select")
    set(_names "KeyType;ValueType;BlockSize")
    set(_values "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_reduce_by_key")
    set(_names "KeyType;ValueType;BlockSize")
    set(_values "${LIMITED_TUNING_TYPES};${TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_find_first_of")
    set(_names "DataType;BlockSize")
    set(_values "${LIMITED_TUNING_TYPES};32 64 128 256 512 1024")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_run_length_encode")
    set(_names "KeyType;BlockSize")
    set(_values "${TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@KeyType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_run_length_encode_non_trivial_runs")
    set(_names "KeyType;BlockSize;BlockLoadMethod")
    set(_values "${TUNING_TYPES};64 128 256 512 1024;block_load_vectorize block_load_warp_transpose")
    set(_suffix "@KeyType@_@BlockSize@_@BlockLoadMethod@")
  elseif(file STREQUAL "benchmark_device_merge")
    set(_names "KeyType;ValueType;BlockSize")
    set(_values "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};32 64 128 256 512 1024")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  else()
    # Unknown benchmark: leave the caller's variables as they are.
    return()
  endif()

  # Export through the variable names the caller asked for.
  set(${list_across_names} "${_names}" PARENT_SCOPE)
  set(${list_across} "${_values}" PARENT_SCOPE)
  set(${output_pattern_suffix} "${_suffix}" PARENT_SCOPE)
endfunction()
rocPRIM-rocm-6.4.3/benchmark/benchmark_block_adjacent_difference.cpp000066400000000000000000000436301502235215600255110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif namespace rp = rocprim; template __global__ __launch_bounds__(BlockSize) void kernel(Args ...args) { Benchmark::template run(args...); } struct subtract_left { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, T(123), storage); } else { adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_left_partial { template __device__ static void run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const 
unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; unsigned int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, T(123), tile_size, storage); } else { adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, tile_size, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_right { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, T(123), storage); } else { adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_right_partial { 
template __device__ static void run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; unsigned int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; adjacent_diff_t().subtract_right_partial(input, output, rp::minus<>{}, tile_size, storage); for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; template auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) -> std::enable_if_t::value && !std::is_same::value> { // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const auto random_range = limit_random_range(0, 10); const std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(input[0]), 
hipMemcpyHostToDevice ) ); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, Trials ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template auto run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) -> std::enable_if_t::value || std::is_same::value> { // Calculate the number of elements N size_t N = bytes / sizeof(T); static constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const auto random_range_input = limit_random_range(0, 10); const auto random_range_tile_sizes = limit_random_range(0, items_per_block); const std::vector input = get_random_data(size, random_range_input.first, random_range_input.second, seed.get_0()); const std::vector tile_sizes = get_random_data(num_blocks, random_range_tile_sizes.first, random_range_tile_sizes.second, seed.get_1()); T* d_input; unsigned int* d_tile_sizes; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); 
HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(input[0]))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_tile_sizes, tile_sizes.data(), tile_sizes.size() * sizeof(tile_sizes[0]), hipMemcpyHostToDevice ) ); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_tile_sizes, d_output, Trials ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_tile_sizes)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:adjacent_difference,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ ",with_tile:" #WITH_TILE "}}") \ .c_str(), \ run_benchmark, \ bytes, \ seed, \ stream) #define BENCHMARK_TYPE(type, block, with_tile) \ CREATE_BENCHMARK(type, block, 1, with_tile), \ CREATE_BENCHMARK(type, block, 3, with_tile), \ CREATE_BENCHMARK(type, block, 4, with_tile), \ CREATE_BENCHMARK(type, block, 8, with_tile), \ CREATE_BENCHMARK(type, block, 16, with_tile), \ CREATE_BENCHMARK(type, block, 32, with_tile) template void add_benchmarks(const std::string& name, std::vector& benchmarks, size_t bytes, const managed_seed& seed, 
hipStream_t stream) { std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(float, 256, false), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(rocprim::half, 256, false), BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(double, 256, false) }; if(!std::is_same::value) { bs.insert(bs.end(), { BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(float, 256, true), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(rocprim::half, 256, true), BENCHMARK_TYPE(long long, 256, true), BENCHMARK_TYPE(double, 256, true) }); } benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; add_benchmarks("subtract_left", benchmarks, bytes, seed, stream); add_benchmarks("subtract_right", benchmarks, bytes, seed, stream); add_benchmarks("subtract_left_partial", benchmarks, bytes, seed, stream); add_benchmarks("subtract_right_partial", benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run 
benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_discontinuity.cpp000066400000000000000000000274131502235215600244740ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// CmdParser #include "benchmark_utils.hpp" #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T * d_input, T * d_output) { Runner::template run(d_input, d_output); } struct flag_heads { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_heads(head_flags, T(123), input, rp::equal_to()); } else { bdiscontinuity.flag_heads(head_flags, input, rp::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct flag_tails { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; bool tail_flags[ItemsPerThread]; if(WithTile) { 
bdiscontinuity.flag_tails(tail_flags, T(123), input, rp::equal_to()); } else { bdiscontinuity.flag_tails(tail_flags, input, rp::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += tail_flags[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct flag_heads_and_tails { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_heads_and_tails(head_flags, T(123), tail_flags, T(234), input, rp::equal_to()); } else { bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, rp::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; input[i] += tail_flags[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; template void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); const auto random_range = limit_random_range(0, 10); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), 
size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:discontinuity,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ ",with_tile:" #WITH_TILE "}}") \ .c_str(), \ run_benchmark, \ bytes, \ seed, \ stream) #define BENCHMARK_TYPE(type, block, bool) \ CREATE_BENCHMARK(type, block, 1, bool), \ CREATE_BENCHMARK(type, block, 2, bool), \ CREATE_BENCHMARK(type, block, 3, bool), \ CREATE_BENCHMARK(type, block, 4, bool), \ CREATE_BENCHMARK(type, block, 8, bool) template void add_benchmarks(const std::string& name, std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(uint8_t, 256, false), BENCHMARK_TYPE(uint8_t, 256, true), BENCHMARK_TYPE(rocprim::half, 256, false), BENCHMARK_TYPE(rocprim::half, 256, true), 
BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(long long, 256, true), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; add_benchmarks("flag_heads", benchmarks, bytes, seed, stream); add_benchmarks("flag_tails", benchmarks, bytes, seed, stream); add_benchmarks("flag_heads_and_tails", benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_exchange.cpp000066400000000000000000000352421502235215600233500ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output) { Runner::template run(d_input, d_ranks, d_output); } struct blocked_to_striped { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.blocked_to_striped(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct striped_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.striped_to_blocked(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct blocked_to_warp_striped { template< class T, unsigned int BlockSize, unsigned int 
ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.blocked_to_warp_striped(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct warp_striped_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.warp_striped_to_blocked(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct scatter_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.scatter_to_blocked(input, input, ranks); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + 
block_offset, input); } }; struct scatter_to_striped { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.scatter_to_striped(input, input, ranks); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; template void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input(size); // Fill input for(size_t i = 0; i < size; i++) { input[i] = T(i); } std::vector ranks(size); // Fill ranks (for scatter operations) engine_type gen(seed.get_0()); for(size_t bi = 0; bi < size / items_per_block; bi++) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); std::shuffle(block_ranks, block_ranks + items_per_block, gen); } T * d_input; unsigned int * d_ranks; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_ranks), size * sizeof(unsigned int))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_ranks, ranks.data(), size * sizeof(unsigned int), 
hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_ranks, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_ranks)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:exchange,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ bytes, \ seed, \ stream) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 7), \ CREATE_BENCHMARK(type, block, 8) template void add_benchmarks(const std::string& name, std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector bs = { BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(rocprim::half, 256), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(custom_float2, 256), BENCHMARK_TYPE(float2, 256), BENCHMARK_TYPE(custom_double2, 256), BENCHMARK_TYPE(double2, 256), 
BENCHMARK_TYPE(float4, 256), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("bytes", "bytes", DEFAULT_BYTES, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("bytes"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; add_benchmarks("blocked_to_striped", benchmarks, bytes, seed, stream); add_benchmarks("striped_to_blocked", benchmarks, bytes, seed, stream); add_benchmarks("blocked_to_warp_striped", benchmarks, bytes, seed, stream); add_benchmarks("warp_striped_to_blocked", benchmarks, bytes, seed, stream); add_benchmarks("scatter_to_blocked", benchmarks, bytes, seed, stream); add_benchmarks("scatter_to_striped", benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_histogram.cpp000066400000000000000000000220371502235215600235610ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct histogram { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize, unsigned int Trials > __device__ static void run(const T* input, T* output) { // TODO: Move global_offset into final loop const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; unsigned int global_offset = blockIdx.x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[index + k]; } using bhistogram_t = rp::block_histogram; __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t().histogram(values, histogram, storage); for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = BinSize - 1 - values[k]; } } ROCPRIM_UNROLL for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + threadIdx.x < BinSize) { output[global_offset + threadIdx.x] = histogram[offset + threadIdx.x]; global_offset += BlockSize; } } } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize = BlockSize, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes) { // Calculate the number of elements N size_t N = bytes / sizeof(T); // Make sure size is a multiple of BlockSize constexpr auto 
items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, 0.0f); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), bin_size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:histogram,key_type:" #T ",cfg:{bs:" #BS \ ",ipt:" #IPT ",method:" \ + method_name + "}}") \ .c_str(), \ run_benchmark, \ stream, \ bytes) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 16) 
template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, size_t bytes) { std::vector new_benchmarks = { BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(unsigned long long, 256), BENCHMARK_TYPE(unsigned long long, 320) }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); // Add benchmarks std::vector benchmarks; // using_atomic using histogram_a_t = histogram; add_benchmarks(benchmarks, "using_atomic", stream, bytes); // using_sort using histogram_s_t = histogram; add_benchmarks(benchmarks, "using_sort", stream, bytes); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_radix_rank.cpp000066400000000000000000000233031502235215600237030ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif namespace rp = rocprim; template __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, unsigned int* ranks_output) { using rank_type = rp::block_radix_rank; const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; rp::block_load_direct_striped(lid, keys_input + block_offset, keys); unsigned int ranks[ItemsPerThread]; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { ROCPRIM_SHARED_MEMORY typename rank_type::storage_type storage; unsigned int begin_bit = 0; const unsigned int end_bit = sizeof(T) * 8; while(begin_bit < end_bit) { const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); if ROCPRIM_IF_CONSTEXPR(Descending) { rank_type().rank_keys_desc(keys, ranks, storage, begin_bit, pass_bits); } else { rank_type().rank_keys(keys, ranks, storage, begin_bit, pass_bits); } begin_bit += RadixBits; } } rp::block_store_direct_striped(lid, ranks_output + block_offset, ranks); } template void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int grid_size = ((N + items_per_block - 1) / items_per_block); const unsigned int size = items_per_block * grid_size; std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); T* d_input; unsigned int* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(unsigned int))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); 
HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::steady_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(rank_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::steady_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, KIND) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:radix_rank,key_type:" #T ",cfg:{bs:" #BS \ ",ipt:" #IPT ",method:" #KIND "}}") \ .c_str(), \ run_benchmark, \ bytes, \ seed, \ stream) // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic), \ CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic_memoize), \ CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::match) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK_KINDS(type, block, 1), \ CREATE_BENCHMARK_KINDS(type, block, 4), \ CREATE_BENCHMARK_KINDS(type, block, 8), \ CREATE_BENCHMARK_KINDS(type, block, 12), \ CREATE_BENCHMARK_KINDS(type, block, 16), \ CREATE_BENCHMARK_KINDS(type, block, 20) // clang-format on void add_benchmarks(std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { std::vector bs = { BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) 
{ cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_radix_sort.cpp000066400000000000000000000307611502235215600237450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif enum class benchmark_kinds { sort_keys, sort_pairs }; namespace rp = rocprim; template using select_decomposer_t = std:: conditional_t::value, custom_type_decomposer, rp::identity_decomposer>; template __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; rp::block_load_direct_striped(lid, input + block_offset, keys); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; sort.sort(keys, 0, sizeof(T) * 8, select_decomposer_t{}); } rp::block_store_direct_striped(lid, output + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; rp::block_load_direct_striped(lid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = keys[i] + T(1); } ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; sort.sort(keys, values, 0, sizeof(T) * 8, select_decomposer_t{}); } for(unsigned int i = 0; i < ItemsPerThread; i++) { keys[i] += values[i]; } rp::block_store_direct_striped(lid, output + block_offset, keys); } template void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, size_t bytes, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block = BlockSize * 
ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); T* d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); if(benchmark_kind == benchmark_kinds::sort_keys) { hipLaunchKernelGGL( HIP_KERNEL_NAME( sort_keys_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME( sort_pairs_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, RB, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:radix_sort,key_type:" #T ",subalgo:" + name \ + ",cfg:{bs:" #BS ",rb:" #RB ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ benchmark_kind, \ bytes, \ seed, \ stream) #define BENCHMARK_TYPE(type, block, radix_bits) \ 
CREATE_BENCHMARK(type, block, radix_bits, 1), CREATE_BENCHMARK(type, block, radix_bits, 2), \ CREATE_BENCHMARK(type, block, radix_bits, 3), \ CREATE_BENCHMARK(type, block, radix_bits, 4), CREATE_BENCHMARK(type, block, radix_bits, 8) void add_benchmarks(benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { using custom_int_type = custom_type; std::vector bs = { BENCHMARK_TYPE(int, 64, 3), BENCHMARK_TYPE(int, 512, 3), BENCHMARK_TYPE(int, 64, 4), BENCHMARK_TYPE(int, 128, 4), BENCHMARK_TYPE(int, 192, 4), BENCHMARK_TYPE(int, 256, 4), BENCHMARK_TYPE(int, 320, 4), BENCHMARK_TYPE(int, 512, 4), BENCHMARK_TYPE(int8_t, 64, 3), BENCHMARK_TYPE(int8_t, 512, 3), BENCHMARK_TYPE(int8_t, 64, 4), BENCHMARK_TYPE(int8_t, 128, 4), BENCHMARK_TYPE(int8_t, 192, 4), BENCHMARK_TYPE(int8_t, 256, 4), BENCHMARK_TYPE(int8_t, 320, 4), BENCHMARK_TYPE(int8_t, 512, 4), BENCHMARK_TYPE(uint8_t, 64, 3), BENCHMARK_TYPE(uint8_t, 512, 3), BENCHMARK_TYPE(uint8_t, 64, 4), BENCHMARK_TYPE(uint8_t, 128, 4), BENCHMARK_TYPE(uint8_t, 192, 4), BENCHMARK_TYPE(uint8_t, 256, 4), BENCHMARK_TYPE(uint8_t, 320, 4), BENCHMARK_TYPE(uint8_t, 512, 4), BENCHMARK_TYPE(rocprim::half, 64, 3), BENCHMARK_TYPE(rocprim::half, 512, 3), BENCHMARK_TYPE(rocprim::half, 64, 4), BENCHMARK_TYPE(rocprim::half, 128, 4), BENCHMARK_TYPE(rocprim::half, 192, 4), BENCHMARK_TYPE(rocprim::half, 256, 4), BENCHMARK_TYPE(rocprim::half, 320, 4), BENCHMARK_TYPE(rocprim::half, 512, 4), BENCHMARK_TYPE(long long, 64, 3), BENCHMARK_TYPE(long long, 512, 3), BENCHMARK_TYPE(long long, 64, 4), BENCHMARK_TYPE(long long, 128, 4), BENCHMARK_TYPE(long long, 192, 4), BENCHMARK_TYPE(long long, 256, 4), BENCHMARK_TYPE(long long, 320, 4), BENCHMARK_TYPE(long long, 512, 4), BENCHMARK_TYPE(custom_int_type, 64, 3), BENCHMARK_TYPE(custom_int_type, 512, 3), BENCHMARK_TYPE(custom_int_type, 64, 4), BENCHMARK_TYPE(custom_int_type, 128, 4), BENCHMARK_TYPE(custom_int_type, 192, 4), 
BENCHMARK_TYPE(custom_int_type, 256, 4), BENCHMARK_TYPE(custom_int_type, 320, 4), BENCHMARK_TYPE(custom_int_type, 512, 4), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmark_kinds::sort_keys, "keys", benchmarks, bytes, seed, stream); add_benchmarks(benchmark_kinds::sort_pairs, "pairs", benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_reduce.cpp000066400000000000000000000231161502235215600230320ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct reduce { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; T reduced_value; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using breduce_t = rp::block_reduce; __shared__ typename breduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { breduce_t().reduce(values, reduced_value, storage); values[0] = reduced_value; } if(threadIdx.x == 0) { output[blockIdx.x] = reduced_value; } } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes) { // Calculate the number of elements N size_t N = bytes / sizeof(T); // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); 
HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:block,algo:reduce,key_type:" #T \ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ + method_name + "}}") \ .c_str(), \ run_benchmark, \ stream, \ bytes) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, size_t bytes) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(rocprim::half, 64), BENCHMARK_TYPE(int, 256), 
BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(rocprim::half, 256), CREATE_BENCHMARK(custom_float2, 256, 1), CREATE_BENCHMARK(custom_float2, 256, 4), CREATE_BENCHMARK(custom_float2, 256, 8), CREATE_BENCHMARK(float2, 256, 1), CREATE_BENCHMARK(float2, 256, 4), CREATE_BENCHMARK(float2, 256, 8), CREATE_BENCHMARK(custom_double2, 256, 1), CREATE_BENCHMARK(custom_double2, 256, 4), CREATE_BENCHMARK(custom_double2, 256, 8), CREATE_BENCHMARK(double2, 256, 1), CREATE_BENCHMARK(double2, 256, 4), CREATE_BENCHMARK(double2, 256, 8), CREATE_BENCHMARK(float4, 256, 1), CREATE_BENCHMARK(float4, 256, 4), CREATE_BENCHMARK(float4, 256, 8), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); // Add benchmarks std::vector benchmarks; // using_warp_scan using reduce_uwr_t = reduce; add_benchmarks(benchmarks, "using_warp_reduce", stream, bytes); // reduce then scan using reduce_rr_t = reduce; add_benchmarks(benchmarks, "raking_reduce", stream, bytes); // reduce commutative only using reduce_rrco_t = reduce; add_benchmarks(benchmarks, "raking_reduce_commutative_only", stream, bytes); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials 
> 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_run_length_decode.cpp000066400000000000000000000247561502235215600252460ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "cmdparser.hpp" #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif template __global__ __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, const OffsetT* d_run_offsets, ItemT* d_decoded_items, bool enable_store = false) { using BlockRunLengthDecodeT = rocprim::block_run_length_decode; ItemT run_items[RunsPerThread]; OffsetT run_offsets[RunsPerThread]; const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items); rocprim::block_load_direct_blocked(global_thread_idx, d_run_offsets, run_offsets); BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); const OffsetT total_decoded_size = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; #pragma nounroll for(unsigned i = 0; i < Trials; ++i) { OffsetT decoded_window_offset = 0; while(decoded_window_offset < total_decoded_size) { ItemT decoded_items[DecodedItemsPerThread]; block_run_length_decode.run_length_decode(decoded_items, decoded_window_offset); if(enable_store) { rocprim::block_store_direct_blocked(global_thread_idx, d_decoded_items + decoded_window_offset, decoded_items); } decoded_window_offset += BlockSize * DecodedItemsPerThread; } } } template void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements N size_t N = bytes / sizeof(ItemT); constexpr auto runs_per_block = BlockSize * RunsPerThread; const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); const auto num_runs = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block); std::vector run_items(num_runs); std::vector run_offsets(num_runs + 1); engine_type prng(seed.get_0()); using ItemDistribution = 
std::conditional_t::value, std::uniform_int_distribution, std::uniform_real_distribution>; ItemDistribution run_item_dist(0, 100); std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); for(size_t i = 0; i < num_runs; ++i) { run_items[i] = run_item_dist(prng); } for(size_t i = 1; i < num_runs + 1; ++i) { const OffsetT next_run_length = run_length_dist(prng); run_offsets[i] = run_offsets[i - 1] + next_run_length; } const OffsetT output_length = run_offsets.back(); ItemT* d_run_items{}; HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); HIP_CHECK(hipMemcpy(d_run_items, run_items.data(), run_items.size() * sizeof(ItemT), hipMemcpyHostToDevice)); OffsetT* d_run_offsets{}; HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); HIP_CHECK(hipMemcpy(d_run_offsets, run_offsets.data(), run_offsets.size() * sizeof(OffsetT), hipMemcpyHostToDevice)); ItemT* d_output{}; HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); for(auto _ : state) { auto start = std::chrono::steady_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel), dim3(num_runs / runs_per_block), dim3(BlockSize), 0, stream, d_run_items, d_run_offsets, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::steady_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials); state.SetItemsProcessed(state.iterations() * output_length * Trials); HIP_CHECK(hipFree(d_run_items)); HIP_CHECK(hipFree(d_run_offsets)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:run_length_decode" \ ",item_type:" #IT ",offset_type:" #OT ",min_run_length:" #MINRL \ ",max_run_length:" #MAXRL ",cfg:{block_size:" #BS \ 
",run_per_thread:" #RPT ",decoded_items_per_thread:" #DIPT "}}") \ .c_str(), \ &run_benchmark, \ bytes, \ seed, \ stream) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks 
benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_scan.cpp000066400000000000000000000257321502235215600225150ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct inclusive_scan { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().inclusive_scan(values, values, storage); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template struct exclusive_scan { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; using U = typename std::remove_reference::type; T values[ItemsPerThread]; U init = U(100); for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().exclusive_scan(values, values, init, storage); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template< class Benchmark, class T, unsigned int 
BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes) { // Calculate the number of elements N size_t N = bytes / sizeof(T); // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:scan,subalgo:" + algorithm_name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ + method_name + "}}") \ .c_str(), \ run_benchmark, \ stream, \ bytes) #define BENCHMARK_TYPE(type, block) 
\ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t bytes) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(rocprim::half, 64), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(rocprim::half, 256), CREATE_BENCHMARK(custom_float2, 256, 1), CREATE_BENCHMARK(custom_float2, 256, 4), CREATE_BENCHMARK(custom_float2, 256, 8), CREATE_BENCHMARK(float2, 256, 1), CREATE_BENCHMARK(float2, 256, 4), CREATE_BENCHMARK(float2, 256, 8), CREATE_BENCHMARK(custom_double2, 256, 1), CREATE_BENCHMARK(custom_double2, 256, 4), CREATE_BENCHMARK(custom_double2, 256, 8), CREATE_BENCHMARK(double2, 256, 1), CREATE_BENCHMARK(double2, 256, 4), CREATE_BENCHMARK(double2, 256, 8), CREATE_BENCHMARK(float4, 256, 1), CREATE_BENCHMARK(float4, 256, 4), CREATE_BENCHMARK(float4, 256, 8), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = 
parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); // Add benchmarks std::vector benchmarks; // inclusive_scan using_warp_scan using inclusive_scan_uws_t = inclusive_scan; add_benchmarks( benchmarks, "inclusive_scan", "using_warp_scan", stream, bytes ); // exclusive_scan using_warp_scan using exclusive_scan_uws_t = exclusive_scan; add_benchmarks( benchmarks, "exclusive_scan", "using_warp_scan", stream, bytes ); // inclusive_scan reduce then scan using inclusive_scan_rts_t = inclusive_scan; add_benchmarks( benchmarks, "inclusive_scan", "reduce_then_scan", stream, bytes ); // exclusive_scan reduce then scan using exclusive_scan_rts_t = exclusive_scan; add_benchmarks( benchmarks, "exclusive_scan", "reduce_then_scan", stream, bytes ); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_sort.cpp000066400000000000000000000153521502235215600225550ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_block_sort.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif #define CREATE_BENCHMARK_IPT(K, V, BS, IPT) \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); #define CREATE_BENCHMARK(K, V, BS) \ CREATE_BENCHMARK_IPT(K, V, BS, 1) \ CREATE_BENCHMARK_IPT(K, V, BS, 4) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP const hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // If we are NOT config tuning run a selection of benchmarks // Block sizes as large as possible ar most relevant #ifndef BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(float, rocprim::empty_type, 256) CREATE_BENCHMARK(double, rocprim::empty_type, 256) CREATE_BENCHMARK(rocprim::half, rocprim::empty_type, 256) 
CREATE_BENCHMARK(uint8_t, rocprim::empty_type, 256) CREATE_BENCHMARK(int, rocprim::empty_type, 256) CREATE_BENCHMARK(int, rocprim::empty_type, 512) CREATE_BENCHMARK(double, rocprim::empty_type, 512) CREATE_BENCHMARK(int, int, 512) CREATE_BENCHMARK(float, double, 512) CREATE_BENCHMARK(double, int64_t, 512) CREATE_BENCHMARK(rocprim::half, int16_t, 512) CREATE_BENCHMARK(uint8_t, uint32_t, 512) #endif std::vector benchmarks = {}; config_autotune_register::register_benchmark_subset(benchmarks, 0, 1, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_block_sort.parallel.hpp000066400000000000000000000271501502235215600243540ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include template::value, bool> = true> __global__ __launch_bounds__(BlockSize) void sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); rocprim::block_sort bsort; bsort.sort(keys); rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template::value, bool> = true> __global__ __launch_bounds__(BlockSize) void sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; ValueType values[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { values[item] = block_offset + lid * ItemsPerThread + item; } rocprim::block_sort bsort; bsort.sort(keys, values); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { keys[item] = keys[item] + static_cast(values[item]); } rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void stable_sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType 
keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); using stable_key_type = rocprim::tuple; stable_key_type stable_keys[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { stable_keys[item] = rocprim::make_tuple(keys[item], ItemsPerThread * lid + item); } // Special comparison that preserves relative order of equal keys auto stable_compare_function = [](const stable_key_type& a, const stable_key_type& b) mutable -> bool { const bool ab = rocprim::less{}(rocprim::get<0>(a), rocprim::get<0>(b)); return ab || (!rocprim::less{}(rocprim::get<0>(b), rocprim::get<0>(a)) && (rocprim::get<1>(a) < rocprim::get<1>(b))); }; rocprim::block_sort bsort; bsort.sort(stable_keys, stable_compare_function); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { keys[item] = rocprim::get<0>(stable_keys[item]); } rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template struct block_sort_benchmark : public config_autotune_interface { private: static constexpr bool with_values = !std::is_same::value; static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; static const char* get_block_sort_method_name(rocprim::block_sort_algorithm alg) { switch(alg) { case rocprim::block_sort_algorithm::merge_sort: return "merge_sort"; case rocprim::block_sort_algorithm::stable_merge_sort: return "stable_merge_sort"; case rocprim::block_sort_algorithm::bitonic_sort: return "bitonic_sort"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } public: std::string sort_key() const override { using namespace std::string_literals; return std::string((with_values ? "_pairs"s : "_keys"s) + (stable ? 
"_stable"s : ""s) + pad_string(std::to_string(items_per_block), 5) + ", " + name()); } std::string name() const override { return bench_naming::format_name( "{lvl:block,algo:sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",stable:" + (stable ? "true" : "false") + ",cfg:{bs:" + std::to_string(BlockSize) + ",ipt:" + std::to_string(ItemsPerThread) + ",method:" + std::string(get_block_sort_method_name(block_sort_algorithm)) + "}}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; static constexpr bool debug_synchronous = false; static auto dispatch_block_sort(std::false_type /*stable_sort*/, size_t size, const hipStream_t stream, KeyType* d_input, KeyType* d_output) { hipLaunchKernelGGL( HIP_KERNEL_NAME( sort_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } static auto dispatch_block_sort(std::true_type /*stable_sort*/, size_t size, const hipStream_t stream, KeyType* d_input, KeyType* d_output) { hipLaunchKernelGGL(HIP_KERNEL_NAME(stable_sort_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements N size_t N = bytes / sizeof(KeyType); const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); KeyType* d_input; KeyType* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(KeyType))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(KeyType))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(KeyType), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); static constexpr auto stable_tag = rocprim::detail::bool_constant{}; // HIP events creation hipEvent_t start, stop; 
HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); // Run for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { dispatch_block_sort(stable_tag, size, stream, d_input, d_output); } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(KeyType)); state.SetItemsProcessed(state.iterations() * batch_size * size); state.counters["sorted_size"] = benchmark::Counter(BlockSize * ItemsPerThread, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } }; #endif // ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_config_dispatch.cpp000066400000000000000000000073541502235215600235430ustar00rootroot00000000000000 #include "benchmark_utils.hpp" #include "cmdparser.hpp" #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif enum class stream_kind { default_stream, per_thread_stream, explicit_stream, async_stream }; static void BM_host_target_arch(benchmark::State& state, const stream_kind stream_kind) { const hipStream_t stream = [stream_kind]() -> hipStream_t { hipStream_t stream = 0; switch(stream_kind) { case stream_kind::default_stream: return stream; case stream_kind::per_thread_stream: return hipStreamPerThread; case stream_kind::explicit_stream: HIP_CHECK(hipStreamCreate(&stream)); return stream; case stream_kind::async_stream: HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); return stream; } }(); for(auto _ 
: state) { rocprim::detail::target_arch target_arch; HIP_CHECK(rocprim::detail::host_target_arch(stream, target_arch)); benchmark::DoNotOptimize(target_arch); } if(stream_kind != stream_kind::default_stream && stream_kind != stream_kind::per_thread_stream) { HIP_CHECK(hipStreamDestroy(stream)); } } __global__ void empty_kernel() {} // An empty kernel launch for baseline static void BM_kernel_launch(benchmark::State& state) { static constexpr hipStream_t stream = 0; for(auto _ : state) { hipLaunchKernelGGL(empty_kernel, dim3(1), dim3(1), 0, stream); HIP_CHECK(hipGetLastError()); } HIP_CHECK(hipStreamSynchronize(stream)); } #define CREATE_BENCHMARK(ST, SK) \ benchmark::RegisterBenchmark( \ bench_naming::format_name( \ "{lvl:na" \ ",algo:" #ST \ ",cfg:default_config}" \ ).c_str(), \ &BM_host_target_arch, \ SK \ ) \ int main(int argc, char** argv) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", 100, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP std::vector benchmarks{ CREATE_BENCHMARK(default_stream, stream_kind::default_stream), CREATE_BENCHMARK(per_thread_stream, stream_kind::per_thread_stream), CREATE_BENCHMARK(explicit_stream, stream_kind::explicit_stream), CREATE_BENCHMARK(async_stream, stream_kind::async_stream), benchmark::RegisterBenchmark( bench_naming::format_name("{lvl:na,algo:empty_kernel,cfg:default_config}").c_str(), BM_kernel_launch)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } 
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_adjacent_difference.cpp000066400000000000000000000123501502235215600256510ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_adjacent_difference.parallel.hpp" #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include // CmdParser #include "cmdparser.hpp" #include #include #include #ifndef DEFAULT_BYTES constexpr std::size_t DEFAULT_BYTES = 1024LL * 1024LL * 1024LL * 2LL; #endif #define CREATE_BENCHMARK(T, left, in_place) \ { \ const device_adjacent_difference_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ } // clang-format off #define CREATE_BENCHMARKS(T) \ CREATE_BENCHMARK(T, true, false) \ CREATE_BENCHMARK(T, true, true) \ CREATE_BENCHMARK(T, false, false) \ CREATE_BENCHMARK(T, false, true) // clang-format on int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "size in bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP const hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); benchmark::AddCustomContext("seed", seed_type); std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = 
parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, seed, stream); #else // BENCHMARK_CONFIG_TUNING using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks CREATE_BENCHMARKS(int) CREATE_BENCHMARKS(std::int64_t) CREATE_BENCHMARKS(uint8_t) CREATE_BENCHMARKS(rocprim::half) CREATE_BENCHMARKS(float) CREATE_BENCHMARKS(double) CREATE_BENCHMARKS(custom_float2) CREATE_BENCHMARKS(custom_double2) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in000066400000000000000000000027411502235215600300540ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_adjacent_difference.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_adjacent_difference_benchmark_generator< @DataType@, @BlockSize@, @Left@, @InPlace@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_adjacent_difference.parallel.hpp000066400000000000000000000270671502235215600274640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include template std::string config_name() { //const rocprim::adjacent_difference_config = Config(); auto config = Config(); return "{bs:" + std::to_string(config.block_size) + ",ipt:" + std::to_string(config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_adjacent_difference_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:adjacent_difference" + (Left ? ""s : "_right"s) + (InPlace ? "_inplace"s : ""s) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) const { return ::rocprim::adjacent_difference(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) const { return ::rocprim::adjacent_difference_right(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... 
args) const { return ::rocprim::adjacent_difference_inplace(temporary_storage, storage_size, input, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) const { return ::rocprim::adjacent_difference_right_inplace(temporary_storage, storage_size, input, std::forward(args)...); } void run(benchmark::State& state, const std::size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using output_type = T; static constexpr bool debug_synchronous = false; // Generate data const size_t size = bytes / sizeof(T); const auto random_range = limit_random_range(1, 100); const std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T* d_input; output_type* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); if(!InPlace) { HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); } static constexpr auto left_tag = rocprim::detail::bool_constant{}; static constexpr auto in_place_tag = rocprim::detail::bool_constant{}; // Allocate temporary storage std::size_t temp_storage_size; void* d_temp_storage = nullptr; const auto launch = [&] { return dispatch_adjacent_difference(left_tag, in_place_tag, d_temp_storage, temp_storage_size, d_input, d_output, size, rocprim::plus<>{}, stream, debug_synchronous); }; HIP_CHECK(launch()); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(launch()); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); // Run for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); 
for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(launch()); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); if(!InPlace) { HIP_CHECK(hipFree(d_output)); } HIP_CHECK(hipFree(d_temp_storage)); } }; template struct device_adjacent_difference_benchmark_generator { static constexpr unsigned int min_items_per_thread = 0; static constexpr unsigned int max_items_per_thread_arg = TUNING_SHARED_MEMORY_MAX / (BlockSize * sizeof(T) * 2 + sizeof(T)); template struct create_ipt { // Device Adjacent difference uses block_load/store_transpose to coalesc memory transaction to global memory // However it accesses shared memory with a stride of items per thread, which leads to reduced performance if power // of two is used for small types. Experiments shown that primes are the best choice for performance. 
static constexpr int primes[] = {1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}; static constexpr uint ipt_num = primes[IptValueIndex]; using generated_config = rocprim::adjacent_difference_config; void operator()(std::vector>& storage) { if(ipt_num < max_items_per_thread_arg) { storage.emplace_back( std::make_unique>()); } } }; static void create(std::vector>& storage) { static constexpr unsigned int max_items_per_thread = rocprim::Log2::VALUE; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_adjacent_find.cpp000066400000000000000000000131521502235215600245000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_adjacent_find.parallel.hpp" #include "benchmark_utils.hpp" #include "cmdparser.hpp" // gbench #include // HIP #include // C++ Standard Library #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = size_t{2} << 30; // 2 GiB #endif #define CREATE_BENCHMARK(T, P) \ { \ const device_adjacent_find_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ } #define CREATE_ADJACENT_FIND_BENCHMARKS(T) \ CREATE_BENCHMARK(T, 1) \ CREATE_BENCHMARK(T, 5) \ CREATE_BENCHMARK(T, 9) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of input bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks{}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, 
parallel_instance, parallel_instances, size, seed, stream); #else // BENCHMARK_CONFIG_TUNING \ // add_adjacent_find_benchmarks(benchmarks, size, seed, stream); using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; // Tuned types CREATE_ADJACENT_FIND_BENCHMARKS(int8_t) CREATE_ADJACENT_FIND_BENCHMARKS(int16_t) CREATE_ADJACENT_FIND_BENCHMARKS(int32_t) CREATE_ADJACENT_FIND_BENCHMARKS(int64_t) CREATE_ADJACENT_FIND_BENCHMARKS(rocprim::half) CREATE_ADJACENT_FIND_BENCHMARKS(float) CREATE_ADJACENT_FIND_BENCHMARKS(double) // Custom types CREATE_ADJACENT_FIND_BENCHMARKS(custom_float2) CREATE_ADJACENT_FIND_BENCHMARKS(custom_double2) CREATE_ADJACENT_FIND_BENCHMARKS(custom_int2) CREATE_ADJACENT_FIND_BENCHMARKS(custom_char_double) CREATE_ADJACENT_FIND_BENCHMARKS(custom_longlong_double) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_adjacent_find.parallel.cpp.in000066400000000000000000000026311502235215600267000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_adjacent_find.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_adjacent_find_benchmark_generator< @InputType@, @BlockSize@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_adjacent_find.parallel.hpp000066400000000000000000000221241502235215600262770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_ADJACENT_FIND_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_ADJACENT_FIND_PARALLEL_HPP_ #include "benchmark_utils.hpp" // gbench #include // HIP #include // rocPRIM #include #include #include // C++ Standard Library #include #include #include #include #include #include template std::string config_name() { auto config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_adjacent_find_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:adjacent_find,input_type:" + std::string(Traits::name()) + ",first_adj_pos:" + std::to_string(FirstAdjPosDecimal * 0.1f) + ",cfg:" + config_name() + "}"); } static constexpr size_t warmup_size = 5; static constexpr size_t batch_size = 10; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using input_type = InputT; using output_type = std::size_t; const size_t size = bytes / sizeof(input_type); // Get index of the first adjacent equal pair std::size_t first_adj_index = static_cast(size * FirstAdjPosDecimal * 0.1f); if(first_adj_index >= size - 1) { first_adj_index = size - 2; } // Generate data ensuring there is no adjacent pair before first_adj_index std::vector input(size); if(std::is_same::value) { // For int8_t that has a very limited range of values, iota initialization // seems to give a more reliable benchmark input std::iota(input.begin(), input.end(), 0); } else { input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector iota(size); std::iota(iota.begin(), iota.end(), 0); std::transform(iota.begin() + 1, iota.begin() + first_adj_index + 1, input.begin() + 1, [&](std::size_t& idx) 
{ while(input[idx] == input[idx - 1]) { input[idx] = get_random_value( generate_limits::min(), generate_limits::max(), seed.get_0()); } return input[idx]; }); } // Insert first adjacent pair input[first_adj_index] = input[first_adj_index + 1]; input_type* d_input; output_type* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(*d_input), hipMemcpyHostToDevice)); std::size_t tmp_storage_size; void* d_tmp_storage = nullptr; auto launch_adjacent_find = [&]() { HIP_CHECK(::rocprim::adjacent_find(d_tmp_storage, tmp_storage_size, d_input, d_output, size, rocprim::equal_to{}, stream, false)); }; // Get size of tmporary storage launch_adjacent_find(); HIP_CHECK(hipMalloc(&d_tmp_storage, tmp_storage_size)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { launch_adjacent_find(); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); // Run for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { launch_adjacent_find(); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * first_adj_index * sizeof(*d_input)); state.SetItemsProcessed(state.iterations() * batch_size * first_adj_index); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_tmp_storage)); } }; template struct device_adjacent_find_benchmark_generator { static constexpr 
unsigned int min_items_per_thread = 1; static constexpr unsigned int max_items_per_thread_arg = TUNING_SHARED_MEMORY_MAX / (BlockSize * sizeof(InputT) * 2); template struct create_pos { template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExp; using generated_config = rocprim::adjacent_find_config; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique>()); } }; void operator()(std::vector>& storage) { static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static_for_each< make_index_range, create_ipt>(storage); } }; static void create(std::vector>& storage) { static_for_each, create_pos>(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_ADJACENT_FIND_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_batch_memcpy.cpp000066400000000000000000000703721502235215600243710ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "cmdparser.hpp" #include #include // rocPRIM #include #include #include #include #include #include #include #include #include #include constexpr uint32_t warmup_size = 5; constexpr int32_t max_size = 1024 * 1024; constexpr int32_t wlev_min_size = rocprim::batch_memcpy_config<>::wlev_size_threshold; constexpr int32_t blev_min_size = rocprim::batch_memcpy_config<>::blev_size_threshold; // Used for generating offsets. We generate a permutation map and then derive // offsets via a sum scan over the sizes in the order of the permutation. This // allows us to keep the order of buffers we pass to batch_memcpy, but still // have source and destinations mappings not be the identity function: // // batch_memcpy( // [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) // [3 , 2 , 1 , 2 ]) // size // // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d // └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! 
// ───┬─── ─────┬───── ───┬─── ─── // └─────────┼─────────┼───┐ // ┌───┘ ┌───┘ │ what batch_memcpy does // ▼ ▼ ▼ // ─── ─────────── ─────── ─────── // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ template std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) { const auto n = input.size(); assert(n > 0); std::vector result(n); std::vector permute(n); std::iota(permute.begin(), permute.end(), 0); std::shuffle(permute.begin(), permute.end(), rng); for(T i = 0, sum = 0; i < n; ++i) { result[permute[i]] = sum; sum += input[permute[i]]; } return result; } using offset_type = size_t; template::type = 0> void init_input(ContainerMemCpy& h_input_for_memcpy, ContainerCopy& /*h_input_for_copy*/, std::mt19937_64& rng, offset_type total_num_bytes) { std::independent_bits_engine bits_engine{rng}; const size_t num_ints = rocprim::detail::ceiling_div(total_num_bytes, sizeof(uint64_t)); h_input_for_memcpy = std::vector(num_ints * sizeof(uint64_t)); // generate_n for uninitialized memory, pragmatically use placement-new, since there are no // uint64_t objects alive yet in the storage. 
std::for_each( reinterpret_cast(h_input_for_memcpy.data()), reinterpret_cast(h_input_for_memcpy.data() + num_ints * sizeof(uint64_t)), [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); } template::type = 0> void init_input(ContainerMemCpy& /*h_input_for_memcpy*/, ContainerCopy& h_input_for_copy, std::mt19937_64& rng, byte_offset_type total_num_bytes) { using value_type = typename ContainerCopy::value_type; std::independent_bits_engine bits_engine{rng}; const size_t num_ints = rocprim::detail::ceiling_div(total_num_bytes, sizeof(uint64_t)); const size_t num_of_elements = rocprim::detail::ceiling_div(num_ints * sizeof(uint64_t), sizeof(value_type)); h_input_for_copy = std::vector(num_of_elements); // generate_n for uninitialized memory, pragmatically use placement-new, since there are no // uint64_t objects alive yet in the storage. std::for_each(reinterpret_cast(h_input_for_copy.data()), reinterpret_cast(h_input_for_copy.data()) + num_ints, [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); } template::type = 0> void batch_copy(void* temporary_storage, size_t& storage_size, InputBufferItType sources, OutputBufferItType destinations, BufferSizeItType sizes, uint32_t num_copies, hipStream_t stream) { HIP_CHECK(rocprim::batch_memcpy(temporary_storage, storage_size, sources, destinations, sizes, num_copies, stream)); } template::type = 0> void batch_copy(void* temporary_storage, size_t& storage_size, InputBufferItType sources, OutputBufferItType destinations, BufferSizeItType sizes, uint32_t num_copies, hipStream_t stream) { HIP_CHECK(rocprim::batch_copy(temporary_storage, storage_size, sources, destinations, sizes, num_copies, stream)); } template struct BatchMemcpyData { size_t total_num_elements = 0; ValueType* d_input = nullptr; ValueType* d_output = nullptr; ValueType** d_buffer_srcs = nullptr; ValueType** d_buffer_dsts = nullptr; BufferSizeType* d_buffer_sizes = nullptr; BatchMemcpyData() = default; 
BatchMemcpyData(const BatchMemcpyData&) = delete; BatchMemcpyData(BatchMemcpyData&& other) : total_num_elements{std::exchange(other.total_num_elements, 0)} , d_input{std::exchange(other.d_input, nullptr)} , d_output{std::exchange(other.d_output, nullptr)} , d_buffer_srcs{std::exchange(other.d_buffer_srcs, nullptr)} , d_buffer_dsts{std::exchange(other.d_buffer_dsts, nullptr)} , d_buffer_sizes{std::exchange(other.d_buffer_sizes, nullptr)} {} BatchMemcpyData& operator=(BatchMemcpyData&& other) { total_num_elements = std::exchange(other.total_num_elements, 0); d_input = std::exchange(other.d_input, nullptr); d_output = std::exchange(other.d_output, nullptr); d_buffer_srcs = std::exchange(other.d_buffer_srcs, nullptr); d_buffer_dsts = std::exchange(other.d_buffer_dsts, nullptr); d_buffer_sizes = std::exchange(other.d_buffer_sizes, nullptr); return *this; }; BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; size_t total_num_bytes() const { return total_num_elements * sizeof(ValueType); } ~BatchMemcpyData() { HIP_CHECK(hipFree(d_buffer_sizes)); HIP_CHECK(hipFree(d_buffer_srcs)); HIP_CHECK(hipFree(d_buffer_dsts)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_input)); } }; template BatchMemcpyData prepare_data(const managed_seed& seed, const int32_t num_tlev_buffers = 1024, const int32_t num_wlev_buffers = 1024, const int32_t num_blev_buffers = 1024) { const bool shuffle_buffers = false; BatchMemcpyData result; const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; constexpr int32_t wlev_min_elems = rocprim::detail::ceiling_div(wlev_min_size, sizeof(ValueType)); constexpr int32_t blev_min_elems = rocprim::detail::ceiling_div(blev_min_size, sizeof(ValueType)); constexpr int32_t max_elems = max_size / sizeof(ValueType); // Generate data std::mt19937_64 rng(seed.get_0()); // Number of elements in each buffer. 
std::vector h_buffer_num_elements(num_buffers); auto iter = h_buffer_num_elements.begin(); iter = generate_random_data_n(iter, num_tlev_buffers, 1, wlev_min_elems - 1, rng); iter = generate_random_data_n(iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); iter = generate_random_data_n(iter, num_blev_buffers, blev_min_elems, max_elems, rng); // Shuffle the sizes so that size classes aren't clustered std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); // Get the byte size of each buffer std::vector h_buffer_num_bytes(num_buffers); for(size_t i = 0; i < num_buffers; ++i) { h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); } result.total_num_elements = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); std::vector h_input_for_memcpy; std::vector h_input_for_copy; init_input(h_input_for_memcpy, h_input_for_copy, rng, result.total_num_elements * sizeof(ValueType)); HIP_CHECK(hipMalloc(&result.d_input, result.total_num_bytes())); HIP_CHECK(hipMalloc(&result.d_output, result.total_num_bytes())); HIP_CHECK(hipMalloc(&result.d_buffer_srcs, num_buffers * sizeof(ValueType*))); HIP_CHECK(hipMalloc(&result.d_buffer_dsts, num_buffers * sizeof(ValueType*))); HIP_CHECK(hipMalloc(&result.d_buffer_sizes, num_buffers * sizeof(BufferSizeType))); // Generate the source and shuffled destination offsets. std::vector src_offsets; std::vector dst_offsets; if(shuffle_buffers) { src_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); dst_offsets = shuffled_exclusive_scan(h_buffer_num_elements, rng); } else { src_offsets = std::vector(num_buffers); dst_offsets = std::vector(num_buffers); // Consecutive offsets (no shuffling). // src/dst offsets first element is 0, so skip that! 
std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); } // Generate the source and destination pointers. std::vector h_buffer_srcs(num_buffers); std::vector h_buffer_dsts(num_buffers); for(size_t i = 0; i < num_buffers; ++i) { h_buffer_srcs[i] = result.d_input + src_offsets[i]; h_buffer_dsts[i] = result.d_output + dst_offsets[i]; } // Prepare the batch memcpy. if(IsMemCpy) { HIP_CHECK(hipMemcpy(result.d_input, h_input_for_memcpy.data(), result.total_num_bytes(), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_bytes.data(), h_buffer_num_bytes.size() * sizeof(BufferSizeType), hipMemcpyHostToDevice)); } else { HIP_CHECK(hipMemcpy(result.d_input, h_input_for_copy.data(), result.total_num_bytes(), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_sizes, h_buffer_num_elements.data(), h_buffer_num_elements.size() * sizeof(BufferSizeType), hipMemcpyHostToDevice)); } HIP_CHECK(hipMemcpy(result.d_buffer_srcs, h_buffer_srcs.data(), h_buffer_srcs.size() * sizeof(ValueType*), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(result.d_buffer_dsts, h_buffer_dsts.data(), h_buffer_dsts.size() * sizeof(ValueType*), hipMemcpyHostToDevice)); return result; } template void run_benchmark(benchmark::State& state, const managed_seed& seed, hipStream_t stream, const int32_t num_tlev_buffers = 1024, const int32_t num_wlev_buffers = 1024, const int32_t num_blev_buffers = 1024) { const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; size_t temp_storage_bytes = 0; BatchMemcpyData data; batch_copy(nullptr, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream); void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); data = prepare_data(seed, num_tlev_buffers, num_wlev_buffers, 
num_blev_buffers); // Warm-up for(size_t i = 0; i < warmup_size; i++) { batch_copy(d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); batch_copy(d_temp_storage, temp_storage_bytes, data.d_buffer_srcs, data.d_buffer_dsts, data.d_buffer_sizes, num_buffers, stream); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); state.SetItemsProcessed(state.iterations() * data.total_num_elements); HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); HIP_CHECK(hipFree(d_temp_storage)); } // Naive implementation used for comparison #ifdef BUILD_NAIVE_BENCHMARK template __launch_bounds__(BlockSize) __global__ void naive_kernel(void** in_ptr, void** out_ptr, const OffsetType* sizes) { using underlying_type = unsigned char; constexpr int32_t items_per_thread = 4; constexpr int32_t tile_size = items_per_thread * BlockSize; const int32_t buffer_id = rocprim::flat_block_id(); auto in = reinterpret_cast(in_ptr[buffer_id]); auto out = reinterpret_cast(out_ptr[buffer_id]); const auto size = sizes[buffer_id]; const auto size_in_elements = size / sizeof(underlying_type); const auto tiles = size_in_elements / tile_size; auto num_items_to_copy = size; for(size_t i = 0; i < tiles; ++i) { underlying_type data[items_per_thread]; rocprim::block_load_direct_blocked(rocprim::flat_block_thread_id(), in, data, num_items_to_copy); rocprim::block_store_direct_blocked(rocprim::flat_block_thread_id(), out, 
data, num_items_to_copy); in += tile_size; out += tile_size; num_items_to_copy -= tile_size; } } template void run_naive_benchmark(benchmark::State& state, const managed_seed& seed, hipStream_t stream, const int32_t num_tlev_buffers = 1024, const int32_t num_wlev_buffers = 1024, const int32_t num_blev_buffers = 1024) { const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; const auto data = prepare_data(seed, num_tlev_buffers, num_wlev_buffers, num_blev_buffers); // Warm-up for(size_t i = 0; i < warmup_size; i++) { naive_kernel <<>>((void**)data.d_buffer_srcs, (void**)data.d_buffer_dsts, data.d_buffer_sizes); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); naive_kernel <<>>((void**)data.d_buffer_srcs, (void**)data.d_buffer_dsts, data.d_buffer_sizes); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } state.SetBytesProcessed(state.iterations() * data.total_num_bytes()); state.SetItemsProcessed(state.iterations() * data.total_num_elements); HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); } #define CREATE_NAIVE_BENCHMARK(item_size, \ item_alignment, \ size_type, \ num_tlev, \ num_wlev, \ num_blev) \ benchmark::RegisterBenchmark( \ bench_naming::format_name( \ "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ ",size_type:" #size_type ",algo:naive_memcpy,num_tlev:" #num_tlev \ ",num_wlev:" #num_wlev ",num_blev:" #num_blev ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { \ run_naive_benchmark, \ size_type, \ true>(state, seed, stream, num_tlev, num_wlev, num_blev); \ }) #endif // 
BUILD_NAIVE_BENCHMARK #define CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,item_size:" #item_size \ ",item_alignment:" #item_alignment ",size_type:" #size_type \ ",algo:batch_memcpy,num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ ",num_blev:" #num_blev ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { \ run_benchmark, size_type, true>( \ state, \ seed, \ stream, \ num_tlev, \ num_wlev, \ num_blev); \ run_benchmark, size_type, false>( \ state, \ seed, \ stream, \ num_tlev, \ num_wlev, \ num_blev); \ }) #ifndef BUILD_NAIVE_BENCHMARK #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) #else #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ CREATE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000), \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0), \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0), \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000), \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) #endif //BUILD_NAIVE_BENCHMARK int32_t main(int32_t argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", 1024, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); 
parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int32_t trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = hipStreamDefault; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; benchmarks = {BENCHMARK_TYPE(1, 1), BENCHMARK_TYPE(1, 2), BENCHMARK_TYPE(1, 4), BENCHMARK_TYPE(1, 8), BENCHMARK_TYPE(2, 2), BENCHMARK_TYPE(4, 4), BENCHMARK_TYPE(8, 8)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_binary_search.cpp000066400000000000000000000271231502235215600245430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_binary_search.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t haystack_bytes, const managed_seed& seed, hipStream_t stream, size_t needles_bytes, bool sorted_needles) { using haystack_type = T; using needle_type = T; using output_type = size_t; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; // Calculate the number of elements from byte size size_t haystack_size = haystack_bytes / sizeof(haystack_type); size_t needles_size = needles_bytes / sizeof(needle_type); compare_op_type compare_op; // Generate data std::vector haystack(haystack_size); std::iota(haystack.begin(), haystack.end(), 0); const auto random_range = limit_random_range(0, haystack_size); std::vector needles = get_random_data(needles_size, random_range.first, random_range.second, seed.get_0()); if(sorted_needles) { std::sort(needles.begin(), needles.end(), compare_op); } haystack_type * d_haystack; needle_type * d_needles; output_type * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_haystack), haystack_size * sizeof(haystack_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_needles), needles_size 
* sizeof(needle_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), needles_size * sizeof(output_type))); HIP_CHECK( hipMemcpy( d_haystack, haystack.data(), haystack_size * sizeof(haystack_type), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_needles, needles.data(), needles_size * sizeof(needle_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes; auto dispatch_helper = dispatch_binary_search_helper(); HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{}, d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{}, d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{}, d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * needles_size * sizeof(needle_type)); state.SetItemsProcessed(state.iterations() * batch_size * 
needles_size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_haystack)); HIP_CHECK(hipFree(d_needles)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, K, SORTED, ALGO_TAG) \ benchmark::RegisterBenchmark( \ bench_naming::format_name( \ "{lvl:device,algo:" + ALGO_TAG{}.name() + ",key_type:" #T ",subalgo:" #K "_percent_" \ + std::string(SORTED ? "sorted" : "random") + "_needles,cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { run_benchmark(state, bytes, seed, stream, bytes * K / 100, SORTED); }) #define BENCHMARK_ALGORITHMS(T, K, SORTED) \ CREATE_BENCHMARK(T, K, SORTED, binary_search_subalgorithm), \ CREATE_BENCHMARK(T, K, SORTED, lower_bound_subalgorithm), \ CREATE_BENCHMARK(T, K, SORTED, upper_bound_subalgorithm) #define BENCHMARK_TYPE(type) \ BENCHMARK_ALGORITHMS(type, 10, true), BENCHMARK_ALGORITHMS(type, 10, false) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", 
std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks std::vector benchmarks; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else // BENCHMARK_CONFIG_TUNING benchmarks = {BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_TYPE(custom_float2), BENCHMARK_TYPE(custom_double2)}; #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_binary_search.parallel.cpp.in000066400000000000000000000027201502235215600267370ustar00rootroot00000000000000// MIT License // // Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_binary_search.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create>>(); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_binary_search.parallel.hpp000066400000000000000000000223101502235215600263340ustar00rootroot00000000000000// MIT License // // Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_BINARY_SEARCH_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_BINARY_SEARCH_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include #include #include #include #include #include #include struct binary_search_subalgorithm { std::string name() const { return "binary_search"; } }; struct lower_bound_subalgorithm { std::string name() const { return "lower_bound"; } }; struct upper_bound_subalgorithm { std::string name() const { return "upper_bound"; } }; template struct dispatch_binary_search_helper { template hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args) { using config = rocprim::binary_search_config; return rocprim::binary_search(std::forward(args)...); } template hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args) { using config = rocprim::upper_bound_config; return rocprim::upper_bound(std::forward(args)...); } template hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... args) { using config = rocprim::lower_bound_config; return rocprim::lower_bound(std::forward(args)...); } }; template<> struct dispatch_binary_search_helper { template hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args) { return rocprim::binary_search(std::forward(args)...); } template hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args) { return rocprim::upper_bound(std::forward(args)...); } template hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... 
args) { return rocprim::lower_bound(std::forward(args)...); } }; template struct device_binary_search_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:" + SubAlgorithm{}.name() + ",value_type:" + std::string(Traits::name()) + ",output_type:" + std::string(Traits::name()) + ",cfg:{bs:" + std::to_string(Config::block_size) + ",ipt:" + std::to_string(Config::items_per_thread) + "}}"); } void run(benchmark::State& state, size_t haystack_size, const managed_seed& seed, hipStream_t stream) const override { using compare_op_t = rocprim::less; const auto needles_size = haystack_size / 10; compare_op_t compare_op; std::vector haystack(haystack_size); std::iota(haystack.begin(), haystack.end(), 0); const auto random_range = limit_random_range(0, haystack_size); std::vector needles = get_random_data(needles_size, random_range.first, random_range.second, seed.get_0()); T* d_haystack; T* d_needles; OutputType* d_output; HIP_CHECK(hipMalloc(&d_haystack, haystack_size * sizeof(*d_haystack))); HIP_CHECK(hipMalloc(&d_needles, needles_size * sizeof(*d_needles))); HIP_CHECK(hipMalloc(&d_output, needles_size * sizeof(*d_output))); HIP_CHECK(hipMemcpy(d_haystack, haystack.data(), haystack_size * sizeof(*d_haystack), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_needles, needles.data(), needles_size * sizeof(*d_needles), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; auto dispatch_helper = dispatch_binary_search_helper(); HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{}, d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{}, d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, 
needles_size, compare_op, stream)); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{}, d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * needles_size * sizeof(T)); state.SetItemsProcessed(state.iterations() * needles_size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_haystack)); HIP_CHECK(hipFree(d_needles)); HIP_CHECK(hipFree(d_output)); } }; #endif // ROCPRIM_BENCHMARK_BINARY_SEARCH_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_find_end.cpp000066400000000000000000000111271502235215600234750ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_find_end.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK_FIND_END(TYPE, KEY_SIZE, REPEATING) \ { \ const device_find_end_benchmark instance(KEY_SIZE, REPEATING); \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_BENCHMARK_PATTERN(TYPE, REPEATING) \ { \ CREATE_BENCHMARK_FIND_END(TYPE, 10, REPEATING) \ CREATE_BENCHMARK_FIND_END(TYPE, 100, REPEATING) \ CREATE_BENCHMARK_FIND_END(TYPE, 1000, REPEATING) \ CREATE_BENCHMARK_FIND_END(TYPE, 10000, REPEATING) \ } #define CREATE_BENCHMARK(TYPE) \ { \ CREATE_BENCHMARK_PATTERN(TYPE, true) \ CREATE_BENCHMARK_PATTERN(TYPE, false) \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("bytes", "bytes", DEFAULT_BYTES, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("bytes"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP 
hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks{}; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(float) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_char_double) CREATE_BENCHMARK(custom_longlong_double) // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_find_end.hpp000066400000000000000000000164551502235215600235130ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_END_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_FIND_END_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include template struct device_find_end_benchmark : public config_autotune_interface { size_t key_size_ = 10; bool repeating_ = false; device_find_end_benchmark(size_t KeySize, bool repeating) { key_size_ = KeySize; repeating_ = repeating; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:find_end,value_pattern:" + (repeating_ ? "repeating"s : "random"s) + ",key_size:" + std::to_string(key_size_) + ",value_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using key_type = Key; using output_type = size_t; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t key_size = std::min(size, key_size_); // Generate data std::vector keys_input = get_random_data(key_size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector input(size); if(repeating_) { // Repeating similar pattern without early exits. 
keys_input[0] = 0; for(size_t i = 0; i < size; i++) { input[i] = keys_input[i % key_size]; } keys_input[0] = 1; } else { input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0() + 1); } key_type* d_keys_input; key_type* d_input; output_type* d_output; HIP_CHECK(hipMalloc(&d_keys_input, key_size * sizeof(*d_keys_input))); HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), key_size * sizeof(*d_keys_input), hipMemcpyHostToDevice)); rocprim::equal_to compare_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::find_end(d_temporary_storage, temporary_storage_bytes, d_input, d_keys_input, d_output, size, key_size, compare_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rocprim::find_end(d_temporary_storage, temporary_storage_bytes, d_input, d_keys_input, d_output, size, key_size, compare_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::find_end(d_temporary_storage, temporary_storage_bytes, d_input, d_keys_input, d_output, size, key_size, compare_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); 
state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(*d_input)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_FIND_END_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_find_first_of.cpp000066400000000000000000000130671502235215600245470ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// NOTE(review): the header names of the bare `#include` directives and the template
// arguments in this file appear to have been lost during extraction; they are left as found.
#include "benchmark_device_find_first_of.parallel.hpp"
#include "benchmark_utils.hpp"
// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include

// HIP API
#include

#include
#include

#ifndef DEFAULT_BYTES
// Default value of the "size" command-line option (in bytes); can be overridden at compile
// time by defining DEFAULT_BYTES.
constexpr size_t DEFAULT_BYTES = size_t{1} << 27; // 128 MiB
#endif

// Registers a single find_first_of benchmark instance constructed from KEYS_SIZE and
// FIRST_OCCURENCE. Must be expanded where `benchmarks`, `size`, `seed` and `stream` are in
// scope (see main()).
#define CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, FIRST_OCCURENCE)           \
    {                                                                              \
        const device_find_first_of_benchmark instance(KEYS_SIZE, FIRST_OCCURENCE); \
        REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance);              \
    }

// Registers benchmarks for one (TYPE, KEYS_SIZE) pair with the first occurrence placed at
// 10%, 50% and 100% of the input (1.0 is past the last element, so no key is planted).
// clang-format off
#define CREATE_BENCHMARK0(TYPE, KEYS_SIZE)                   \
    {                                                        \
        CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.1) \
        CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.5) \
        CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 1.0) \
    }

// Registers benchmarks for TYPE with key counts of 1, 10, 100, 1000 and 10000.
#define CREATE_BENCHMARK(TYPE)         \
    {                                  \
        CREATE_BENCHMARK0(TYPE, 1)     \
        CREATE_BENCHMARK0(TYPE, 10)    \
        CREATE_BENCHMARK0(TYPE, 100)   \
        CREATE_BENCHMARK0(TYPE, 1000)  \
        CREATE_BENCHMARK0(TYPE, 10000) \
    }
// clang-format on

// Entry point: parses the command line, registers the find_first_of benchmarks (either the
// autotuning subset or the fixed list of value types) and runs them with manual timing.
int main(int argc, char* argv[])
{
    cli::Parser parser(argc, argv);
    parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format", "name_format", "human", "either: json,human,txt");
    parser.set_optional("seed", "seed", "random", get_seed_message());
#ifdef BENCHMARK_CONFIG_TUNING
    // optionally run an evenly split subset of benchmarks, when making multiple program invocations
    parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index");
    parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances");
#endif
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t size   = parser.get("size");
    const int    trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));
    const std::string  seed_type = parser.get("seed");
    const managed_seed seed(seed_type);

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("size", std::to_string(size));
    benchmark::AddCustomContext("seed", seed_type);

    // Add benchmarks
    std::vector benchmarks{};
#ifdef BENCHMARK_CONFIG_TUNING
    const int parallel_instance  = parser.get("parallel_instance");
    const int parallel_instances = parser.get("parallel_instances");
    config_autotune_register::register_benchmark_subset(benchmarks,
                                                        parallel_instance,
                                                        parallel_instances,
                                                        size,
                                                        seed,
                                                        stream);
#else // BENCHMARK_CONFIG_TUNING
    CREATE_BENCHMARK(int8_t)
    CREATE_BENCHMARK(int16_t)
    CREATE_BENCHMARK(int32_t)
    CREATE_BENCHMARK(float)
    CREATE_BENCHMARK(int64_t)
    CREATE_BENCHMARK(double)

    using custom_int2            = custom_type;
    using custom_longlong_double = custom_type;
    CREATE_BENCHMARK(custom_int2)
    CREATE_BENCHMARK(custom_longlong_double)
#endif // BENCHMARK_CONFIG_TUNING

    // Use manual timing
    // (the benchmarks report their own time through state.SetIterationTime())
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations
    // (only when a positive "trials" value was given; the default is -1)
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_find_first_of.parallel.cpp.in000066400000000000000000000026111502235215600267400ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_find_first_of.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_find_first_of_benchmark_generator<@DataType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_find_first_of.parallel.hpp000066400000000000000000000253401502235215600263440ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include template std::string config_name() { const rocprim::detail::find_first_of_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_find_first_of_benchmark : public config_autotune_interface { std::vector keys_sizes; std::vector first_occurrences; device_find_first_of_benchmark(size_t keys_size, double first_occurrence) { keys_sizes.push_back(keys_size); first_occurrences.push_back(first_occurrence); } device_find_first_of_benchmark(const std::vector& keys_sizes, const std::vector& first_occurrences) { this->keys_sizes = keys_sizes; this->first_occurrences = first_occurrences; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:find_first_of,"s + (keys_sizes.size() == 1 ? "keys_size:"s + std::to_string(keys_sizes[0]) : ""s) + (first_occurrences.size() == 1 ? 
",first_occurrence:"s + std::to_string(first_occurrences[0]) : ""s) + ",value_type:"s + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 2; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using type = T; using key_type = T; using output_type = size_t; const size_t size = bytes / sizeof(type); const size_t max_keys_size = *std::max_element(keys_sizes.begin(), keys_sizes.end()); // Generate data std::vector key_input = get_random_data(max_keys_size, 0, 100, seed.get_0()); std::vector input = get_random_data(size, 101, generate_limits::max(), seed.get_0()); std::vector d_inputs(first_occurrences.size()); for(size_t fi = 0; fi < first_occurrences.size(); ++fi) { type* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpyAsync(d_input, input.data(), input.size() * sizeof(*d_input), hipMemcpyHostToDevice, stream)); // Set the first occurrence of keys in input const size_t p = static_cast(size * first_occurrences[fi]); if(p < size) { const type key = key_input[0]; HIP_CHECK(hipMemcpyAsync(d_input + p, &key, sizeof(*d_input), hipMemcpyHostToDevice, stream)); } d_inputs[fi] = d_input; } key_type* d_key_input; output_type* d_output; HIP_CHECK(hipMalloc(&d_key_input, max_keys_size * sizeof(*d_key_input))); HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); HIP_CHECK(hipMemcpy(d_key_input, key_input.data(), key_input.size() * sizeof(*d_key_input), hipMemcpyHostToDevice)); ::rocprim::equal_to compare_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; auto run = [&](size_t key_size, const type* d_input) { HIP_CHECK(rocprim::find_first_of(d_temporary_storage, temporary_storage_bytes, d_input, d_key_input, d_output, input.size(), key_size, compare_op, stream)); }; size_t max_temporary_storage_bytes = 0; for(size_t keys_size : keys_sizes) { run(keys_size, 
d_inputs[0]); max_temporary_storage_bytes = std::max(max_temporary_storage_bytes, temporary_storage_bytes); } temporary_storage_bytes = max_temporary_storage_bytes; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { for(size_t fi = 0; fi < first_occurrences.size(); ++fi) { for(size_t keys_size : keys_sizes) { run(keys_size, d_inputs[fi]); } } } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { for(size_t fi = 0; fi < first_occurrences.size(); ++fi) { for(size_t keys_size : keys_sizes) { run(keys_size, d_inputs[fi]); } } } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); // Only a part of data (before the first occurrence) must be actually processed. In ideal // cases when no thread blocks do unneeded work (i.e. exit early once the match is found), // performance for different values of first_occurrence must be similar. 
size_t sum_effective_size = 0; for(double first_occurrence : first_occurrences) { sum_effective_size += static_cast(size * first_occurrence); } size_t sum_keys_size = 0; for(size_t keys_size : keys_sizes) { sum_keys_size += keys_size; } state.SetBytesProcessed(state.iterations() * batch_size * sum_effective_size * sizeof(*d_inputs[0])); state.SetItemsProcessed(state.iterations() * batch_size * sum_effective_size); // Each input is read once but all keys are read by all threads so performance is likely // compute-bound or bound by cache bandwidth for reading keys rather than reading inputs. // Let's additionally report the rate of comparisons to see if it reaches a plateau with // increasing keys_size. state.counters["comparisons_per_second"] = benchmark::Counter(static_cast(state.iterations() * batch_size * sum_effective_size * sum_keys_size), benchmark::Counter::kIsRate); for(size_t fi = 0; fi < first_occurrences.size(); ++fi) { HIP_CHECK(hipFree(d_inputs[fi])); } HIP_CHECK(hipFree(d_key_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temporary_storage)); } }; template struct device_find_first_of_benchmark_generator { template struct create_ipt { using generated_config = rocprim::find_first_of_config; void operator()(std::vector>& storage) { std::vector keys_sizes{1, 10, 100, 1000}; std::vector first_occurrences{0.1, 0.5, 1.0}; storage.emplace_back( std::make_unique>( keys_sizes, first_occurrences)); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread = 1; static constexpr unsigned int max_items_per_thread = 16; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_histogram.cpp000066400000000000000000001001421502235215600237200ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// NOTE(review): the header names of the bare `#include` directives and the template
// parameter/argument lists in this file appear to have been lost during extraction; they are
// left as found.
#include "benchmark_device_histogram.parallel.hpp"
#include "benchmark_utils.hpp"
// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include

// HIP API
#include
#include

// rocPRIM
#include

#include
#include
#include
#include
#include

#ifndef DEFAULT_BYTES
// Default value of the "size" command-line option (in bytes).
const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
#endif

namespace rp = rocprim;

// Number of algorithm invocations inside one timed iteration.
const unsigned int batch_size = 10;
// Number of untimed warm-up invocations.
const unsigned int warmup_size = 5;

// Maps an entropy-reduction level to the entropy percentage used in benchmark names.
// Levels above 4 are reported as 0.
int get_entropy_percents(int entropy_reduction)
{
    switch(entropy_reduction)
    {
        case 0: return 100;
        case 1: return 81;
        case 2: return 54;
        case 3: return 33;
        case 4: return 20;
        default: return 0;
    }
}

// Entropy-reduction levels for which the *_even benchmarks are registered.
const int entropy_reductions[] = {0, 2, 4, 6};

// Benchmarks rp::histogram_even on `bytes` bytes of generated input, using `bins` bins over
// the range [0, bins * scale). Timing is measured with HIP events and reported manually.
template
void run_even_benchmark(benchmark::State& state,
                        size_t            bytes,
                        const managed_seed&,
                        hipStream_t stream,
                        size_t      bins,
                        size_t      scale,
                        int         entropy_reduction)
{
    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    using level_type   = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    const level_type lower_level = 0;
    const level_type upper_level = bins * scale;

    // Generate data
    std::vector input = generate(size, entropy_reduction, lower_level, upper_level);

    T*            d_input;
    counter_type* d_histogram;
    HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
    // The histogram holds one counter per bin (bins + 1 levels delimit `bins` bins), so it is
    // sized by `bins`, not by the number of input elements.
    HIP_CHECK(hipMalloc(&d_histogram, bins * sizeof(counter_type)));
    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));

    void*  d_temporary_storage     = nullptr;
    size_t temporary_storage_bytes = 0;
    // First call with a null storage pointer only queries the required temporary size.
    HIP_CHECK(rp::histogram_even(d_temporary_storage,
                                 temporary_storage_bytes,
                                 d_input,
                                 size,
                                 d_histogram,
                                 bins + 1,
                                 lower_level,
                                 upper_level,
                                 stream,
                                 false));

    HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
    HIP_CHECK(hipDeviceSynchronize());

    // Warm-up
    for(size_t i = 0; i < warmup_size; i++)
    {
        HIP_CHECK(rp::histogram_even(d_temporary_storage,
                                     temporary_storage_bytes,
                                     d_input,
                                     size,
                                     d_histogram,
                                     bins + 1,
                                     lower_level,
                                     upper_level,
                                     stream,
                                     false));
    }
    HIP_CHECK(hipDeviceSynchronize());

    // HIP events creation
    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));

    for(auto _ : state)
    {
        // Record start event
        HIP_CHECK(hipEventRecord(start, stream));

        for(size_t i = 0; i < batch_size; i++)
        {
            HIP_CHECK(rp::histogram_even(d_temporary_storage,
                                         temporary_storage_bytes,
                                         d_input,
                                         size,
                                         d_histogram,
                                         bins + 1,
                                         lower_level,
                                         upper_level,
                                         stream,
                                         false));
        }

        // Record stop event and wait until it completes
        HIP_CHECK(hipEventRecord(stop, stream));
        HIP_CHECK(hipEventSynchronize(stop));

        float elapsed_mseconds;
        HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
        state.SetIterationTime(elapsed_mseconds / 1000);
    }

    // Destroy HIP events
    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));

    state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
    state.SetItemsProcessed(state.iterations() * batch_size * size);

    HIP_CHECK(hipFree(d_temporary_storage));
    HIP_CHECK(hipFree(d_input));
    HIP_CHECK(hipFree(d_histogram));
}

// Benchmarks rp::multi_histogram_even on interleaved multi-channel input
// (size * Channels elements), producing one `bins`-bin histogram per active channel.
template
void run_multi_even_benchmark(benchmark::State& state,
                              size_t            bytes,
                              const managed_seed&,
                              hipStream_t stream,
                              size_t      bins,
                              size_t      scale,
                              int         entropy_reduction)
{
    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    using level_type   = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    unsigned int num_levels[ActiveChannels];
    level_type   lower_level[ActiveChannels];
    level_type   upper_level[ActiveChannels];
    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
    {
        lower_level[channel] = 0;
        upper_level[channel] = bins * scale;
        num_levels[channel]  = bins + 1;
    }

    // Generate data
    std::vector input
        = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]);

    T*            d_input;
    counter_type* d_histogram[ActiveChannels];
    HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T)));
    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
    {
        HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type)));
    }
    HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice));

    void*  d_temporary_storage     = nullptr;
    size_t temporary_storage_bytes = 0;
    // First call with a null storage pointer only queries the required temporary size.
    HIP_CHECK((rp::multi_histogram_even(d_temporary_storage,
                                        temporary_storage_bytes,
                                        d_input,
                                        size,
                                        d_histogram,
                                        num_levels,
                                        lower_level,
                                        upper_level,
                                        stream,
                                        false)));

    HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
    HIP_CHECK(hipDeviceSynchronize());

    // Warm-up
    for(size_t i = 0; i < warmup_size; i++)
    {
        HIP_CHECK((rp::multi_histogram_even(d_temporary_storage,
                                            temporary_storage_bytes,
                                            d_input,
                                            size,
                                            d_histogram,
                                            num_levels,
                                            lower_level,
                                            upper_level,
                                            stream,
                                            false)));
    }
    HIP_CHECK(hipDeviceSynchronize());

    // HIP events creation
    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));

    for(auto _ : state)
    {
        // Record start event
        HIP_CHECK(hipEventRecord(start, stream));

        for(size_t i = 0; i < batch_size; i++)
        {
            HIP_CHECK((rp::multi_histogram_even(d_temporary_storage,
                                                temporary_storage_bytes,
                                                d_input,
                                                size,
                                                d_histogram,
                                                num_levels,
                                                lower_level,
                                                upper_level,
                                                stream,
                                                false)));
        }

        // Record stop event and wait until it completes
        HIP_CHECK(hipEventRecord(stop, stream));
        HIP_CHECK(hipEventSynchronize(stop));

        float elapsed_mseconds;
        HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
        state.SetIterationTime(elapsed_mseconds / 1000);
    }

    // Destroy HIP events
    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));

    state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T));
    state.SetItemsProcessed(state.iterations() * batch_size * size * Channels);

    HIP_CHECK(hipFree(d_temporary_storage));
    HIP_CHECK(hipFree(d_input));
    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
    {
        HIP_CHECK(hipFree(d_histogram[channel]));
    }
}

// Benchmarks rp::histogram_range on random input, using `bins` bins whose boundaries are the
// consecutive levels 0, 1, ..., bins.
template
void run_range_benchmark(
    benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream, size_t bins)
{
    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    using level_type   = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    // Generate data
    const auto  random_range = limit_random_range(0, bins);
    std::vector input
        = get_random_data(size, random_range.first, random_range.second, seed.get_0());

    std::vector levels(bins + 1);
    for(size_t i = 0; i < levels.size(); i++)
    {
        levels[i] = static_cast(i);
    }

    T*            d_input;
    level_type*   d_levels;
    counter_type* d_histogram;
    HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
    HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(level_type)));
    // The histogram holds one counter per bin (bins + 1 levels delimit `bins` bins), so it is
    // sized by `bins`, not by the number of input elements.
    HIP_CHECK(hipMalloc(&d_histogram, bins * sizeof(counter_type)));
    HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
    HIP_CHECK(
        hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(level_type), hipMemcpyHostToDevice));

    void*  d_temporary_storage     = nullptr;
    size_t temporary_storage_bytes = 0;
    // First call with a null storage pointer only queries the required temporary size.
    HIP_CHECK(rp::histogram_range(d_temporary_storage,
                                  temporary_storage_bytes,
                                  d_input,
                                  size,
                                  d_histogram,
                                  bins + 1,
                                  d_levels,
                                  stream,
                                  false));

    HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
    HIP_CHECK(hipDeviceSynchronize());

    // Warm-up
    for(size_t i = 0; i < warmup_size; i++)
    {
        HIP_CHECK(rp::histogram_range(d_temporary_storage,
                                      temporary_storage_bytes,
                                      d_input,
                                      size,
                                      d_histogram,
                                      bins + 1,
                                      d_levels,
                                      stream,
                                      false));
    }
    HIP_CHECK(hipDeviceSynchronize());

    // HIP events creation
    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));

    for(auto _ : state)
    {
        // Record start event
        HIP_CHECK(hipEventRecord(start, stream));

        for(size_t i = 0; i < batch_size; i++)
        {
            HIP_CHECK(rp::histogram_range(d_temporary_storage,
                                          temporary_storage_bytes,
                                          d_input,
                                          size,
                                          d_histogram,
                                          bins + 1,
                                          d_levels,
                                          stream,
                                          false));
        }

        // Record stop event and wait until it completes
        HIP_CHECK(hipEventRecord(stop, stream));
        HIP_CHECK(hipEventSynchronize(stop));

        float elapsed_mseconds;
        HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
        state.SetIterationTime(elapsed_mseconds / 1000);
    }

    // Destroy HIP events
    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));

    state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
    state.SetItemsProcessed(state.iterations() * batch_size * size);

    HIP_CHECK(hipFree(d_temporary_storage));
    HIP_CHECK(hipFree(d_input));
    HIP_CHECK(hipFree(d_levels));
    HIP_CHECK(hipFree(d_histogram));
}

// Benchmarks rp::multi_histogram_range on interleaved multi-channel random input
// (size * Channels elements), producing one `bins`-bin histogram per active channel.
template
void run_multi_range_benchmark(
    benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream, size_t bins)
{
    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    using level_type   = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    const int    num_levels_channel = bins + 1;
    unsigned int num_levels[ActiveChannels];
    std::vector  levels[ActiveChannels];
    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
    {
        levels[channel].resize(num_levels_channel);
        for(size_t i = 0; i < levels[channel].size(); i++)
        {
            levels[channel][i] = static_cast(i);
        }
        num_levels[channel] = num_levels_channel;
    }

    // Generate data
    const auto  random_range = limit_random_range(0, bins);
    std::vector input
        = get_random_data(size * Channels, random_range.first, random_range.second, seed.get_0());

    T*            d_input;
    level_type*   d_levels[ActiveChannels];
    counter_type* d_histogram[ActiveChannels];
    HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T)));
    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
    {
        HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(level_type)));
        // One counter per bin, consistent with run_multi_even_benchmark.
        HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type)));
    }

    HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice));
    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
    {
        HIP_CHECK(hipMemcpy(d_levels[channel],
                            levels[channel].data(),
                            num_levels_channel * sizeof(level_type),
                            hipMemcpyHostToDevice));
    }

    void*  d_temporary_storage     = nullptr;
    size_t temporary_storage_bytes = 0;
    // First call with a null storage pointer only queries the required temporary size.
    HIP_CHECK((rp::multi_histogram_range(d_temporary_storage,
                                         temporary_storage_bytes,
                                         d_input,
                                         size,
                                         d_histogram,
                                         num_levels,
                                         d_levels,
                                         stream,
                                         false)));

    HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
    HIP_CHECK(hipDeviceSynchronize());

    // Warm-up
    for(size_t i = 0; i < warmup_size; i++)
    {
        HIP_CHECK((rp::multi_histogram_range(d_temporary_storage,
                                             temporary_storage_bytes,
                                             d_input,
                                             size,
                                             d_histogram,
                                             num_levels,
                                             d_levels,
                                             stream,
                                             false)));
    }
    HIP_CHECK(hipDeviceSynchronize());

    // HIP events creation
    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));

    for(auto _ : state)
    {
        // Record start event
        HIP_CHECK(hipEventRecord(start, stream));

        for(size_t i = 0; i < batch_size; i++)
        {
            HIP_CHECK((rp::multi_histogram_range(d_temporary_storage,
                                                 temporary_storage_bytes,
                                                 d_input,
                                                 size,
                                                 d_histogram,
                                                 num_levels,
                                                 d_levels,
                                                 stream,
                                                 false)));
        }

        // Record stop event and wait until it completes
        HIP_CHECK(hipEventRecord(stop, stream));
        HIP_CHECK(hipEventSynchronize(stop));

        float elapsed_mseconds;
        HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
        state.SetIterationTime(elapsed_mseconds / 1000);
    }

    // Destroy HIP events
    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));

    state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T));
    state.SetItemsProcessed(state.iterations() * batch_size * size * Channels);

    HIP_CHECK(hipFree(d_temporary_storage));
    HIP_CHECK(hipFree(d_input));
    for(unsigned int channel = 0; channel < ActiveChannels; channel++)
    {
        HIP_CHECK(hipFree(d_levels[channel]));
        HIP_CHECK(hipFree(d_histogram[channel]));
    }
}

// Registers one histogram_even benchmark and appends it to VECTOR. Must be expanded where
// `bytes`, `seed`, `stream` and `entropy_reduction` are in scope.
#define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE)                                              \
    VECTOR.push_back(benchmark::RegisterBenchmark(                                                 \
        bench_naming::format_name("{lvl:device,algo:histogram_even,value_type:" #T ",entropy:"     \
                                  + std::to_string(get_entropy_percents(entropy_reduction))        \
                                  + ",bins:" + std::to_string(BINS) + ",cfg:default_config}")      \
            .c_str(),                                                                              \
        [=](benchmark::State& state)                                                               \
        { run_even_benchmark(state, bytes, seed, stream, BINS, SCALE, entropy_reduction); }));

// Registers histogram_even benchmarks for type T with 10, 100, 1000 and 10000 bins.
#define BENCHMARK_EVEN_TYPE(VECTOR, T, S)       \
    CREATE_EVEN_BENCHMARK(VECTOR, T, 10, S);    \
    CREATE_EVEN_BENCHMARK(VECTOR, T, 100, S);   \
    CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, S);  \
    CREATE_EVEN_BENCHMARK(VECTOR, T, 10000, S);

// Appends the histogram_even benchmarks for every entropy-reduction level to `benchmarks`.
void add_even_benchmarks(std::vector&        benchmarks,
                         size_t              bytes,
                         const managed_seed& seed,
                         hipStream_t         stream)
{
    for(int entropy_reduction : entropy_reductions)
    {
        BENCHMARK_EVEN_TYPE(benchmarks, long long, 12345);
        BENCHMARK_EVEN_TYPE(benchmarks, int, 1234);
        BENCHMARK_EVEN_TYPE(benchmarks, short, 5);
        CREATE_EVEN_BENCHMARK(benchmarks, unsigned char, 16, 16);
        CREATE_EVEN_BENCHMARK(benchmarks, unsigned char, 256, 1);
        BENCHMARK_EVEN_TYPE(benchmarks, double, 1234);
        BENCHMARK_EVEN_TYPE(benchmarks, float, 1234);
        BENCHMARK_EVEN_TYPE(benchmarks, rocprim::half, 5);
    }
}

// Expands to a RegisterBenchmark() expression for one multi_histogram_even benchmark. Must be
// expanded where `bytes`, `seed`, `stream` and `entropy_reduction` are in scope.
#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE)                  \
    benchmark::RegisterBenchmark(                                                               \
        bench_naming::format_name("{lvl:device,algo:multi_histogram_even,value_type:" #T        \
                                  ",channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS   \
                                  ",entropy:"                                                   \
                                  + std::to_string(get_entropy_percents(entropy_reduction))     \
                                  + ",bins:" + std::to_string(BINS) + ",cfg:default_config}")   \
            .c_str(),                                                                           \
        [=](benchmark::State& state)                                                            \
        {                                                                                       \
            run_multi_even_benchmark(state,                                                     \
                                     bytes,                                                     \
                                     seed,                                                      \
                                     stream,                                                    \
                                     BINS,                                                      \
                                     SCALE,                                                     \
                                     entropy_reduction);                                        \
        })

// Comma-separated list of multi_histogram_even benchmarks with 10, 100, 1000 and 10000 bins.
// clang-format off
#define BENCHMARK_MULTI_EVEN_TYPE(C, A, T, S)       \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 10, S),    \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 100, S),   \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 1000, S),  \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 10000, S)
// clang-format on

// Appends the multi_histogram_even benchmarks for every entropy-reduction level to
// `benchmarks`.
void add_multi_even_benchmarks(std::vector&        benchmarks,
                               size_t              bytes,
                               const managed_seed& seed,
                               hipStream_t         stream)
{
    for(int entropy_reduction : entropy_reductions)
    {
        std::vector bs = {
            BENCHMARK_MULTI_EVEN_TYPE(4, 4, int, 1234),
            BENCHMARK_MULTI_EVEN_TYPE(4, 3, short, 5),
            CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 16),
            CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1),
            BENCHMARK_MULTI_EVEN_TYPE(3, 3, float, 1234),
        };
        benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
    }
}

// Expands to a RegisterBenchmark() expression for one histogram_range benchmark. Must be
// expanded where `bytes`, `seed` and `stream` are in scope.
#define CREATE_RANGE_BENCHMARK(T, BINS)                                                       \
    benchmark::RegisterBenchmark(                                                             \
        bench_naming::format_name("{lvl:device,algo:histogram_range,value_type:" #T ",bins:"  \
                                  + std::to_string(BINS) + ",cfg:default_config}")            \
            .c_str(),                                                                         \
        [=](benchmark::State& state) { run_range_benchmark(state, bytes, seed, stream, BINS); })

// Comma-separated list of histogram_range benchmarks with 10, 100, 1000 and 10000 bins.
// clang-format off
#define BENCHMARK_RANGE_TYPE(T)       \
    CREATE_RANGE_BENCHMARK(T, 10),    \
    CREATE_RANGE_BENCHMARK(T, 100),   \
    CREATE_RANGE_BENCHMARK(T, 1000),  \
    CREATE_RANGE_BENCHMARK(T, 10000)
// clang-format on

// Appends the histogram_range benchmarks to `benchmarks`.
void add_range_benchmarks(std::vector&        benchmarks,
                          size_t              bytes,
                          const managed_seed& seed,
                          hipStream_t         stream)
{
    std::vector bs = {
        BENCHMARK_RANGE_TYPE(long long),
        BENCHMARK_RANGE_TYPE(int),
        BENCHMARK_RANGE_TYPE(short),
        CREATE_RANGE_BENCHMARK(unsigned char, 16),
        CREATE_RANGE_BENCHMARK(unsigned char, 256),
        BENCHMARK_RANGE_TYPE(double),
        BENCHMARK_RANGE_TYPE(float),
        BENCHMARK_RANGE_TYPE(rocprim::half),
    };
    benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
}

// Expands to a RegisterBenchmark() expression for one multi_histogram_range benchmark. Must
// be expanded where `bytes`, `seed` and `stream` are in scope.
#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS)                       \
    benchmark::RegisterBenchmark(                                                              \
        bench_naming::format_name("{lvl:device,algo:multi_histogram_range,value_type:" #T      \
                                  ",channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS  \
                                  ",bins:"                                                     \
                                  + std::to_string(BINS) + ",cfg:default_config}")             \
            .c_str(),                                                                          \
        [=](benchmark::State& state) {                                                         \
            run_multi_range_benchmark(state,                                                   \
                                      bytes,                                                   \
                                      seed,                                                    \
                                      stream,                                                  \
                                      BINS);                                                   \
        })

// Comma-separated list of multi_histogram_range benchmarks with 10, 100, 1000 and 10000 bins.
// clang-format off
#define BENCHMARK_MULTI_RANGE_TYPE(C, A, T)       \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 10),    \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 100),   \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 1000),  \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 10000)
// clang-format on

// Appends the multi_histogram_range benchmarks to `benchmarks`.
void add_multi_range_benchmarks(std::vector&        benchmarks,
                                size_t              bytes,
                                const managed_seed& seed,
                                hipStream_t         stream)
{
    std::vector bs = {
        BENCHMARK_MULTI_RANGE_TYPE(4, 4, int),
        BENCHMARK_MULTI_RANGE_TYPE(4, 3, short),
        CREATE_MULTI_RANGE_BENCHMARK(4, 3, unsigned char, 16),
        CREATE_MULTI_RANGE_BENCHMARK(4, 3, unsigned char, 256),
        BENCHMARK_MULTI_RANGE_TYPE(3, 3, float),
        BENCHMARK_MULTI_RANGE_TYPE(2, 2, double),
    };
    benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
}

// Entry point: parses the command line, registers the histogram benchmarks (either the
// autotuning subset or the fixed lists above) and runs them with manual timing.
int main(int argc, char* argv[])
{
    cli::Parser parser(argc, argv);
    parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format", "name_format", "human", "either: json,human,txt");
    parser.set_optional("seed", "seed", "random", get_seed_message());
#ifdef BENCHMARK_CONFIG_TUNING
    // optionally run an evenly split subset of benchmarks, when making multiple program invocations
    parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index");
    parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances");
#endif
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t bytes  = parser.get("size");
    const int    trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));
    const std::string  seed_type = parser.get("seed");
    const managed_seed seed(seed_type);

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("bytes", std::to_string(bytes));
    benchmark::AddCustomContext("seed", seed_type);

    // Add benchmarks
    std::vector benchmarks;
#ifdef BENCHMARK_CONFIG_TUNING
    const int parallel_instance  = parser.get("parallel_instance");
    const int parallel_instances = parser.get("parallel_instances");
    config_autotune_register::register_benchmark_subset(benchmarks,
                                                        parallel_instance,
                                                        parallel_instances,
                                                        bytes,
                                                        seed,
                                                        stream);
#else // BENCHMARK_CONFIG_TUNING
    add_even_benchmarks(benchmarks, bytes, seed, stream);
    add_multi_even_benchmarks(benchmarks, bytes, seed, stream);
    add_range_benchmarks(benchmarks, bytes, seed, stream);
    add_multi_range_benchmarks(benchmarks, bytes, seed, stream);
#endif // BENCHMARK_CONFIG_TUNING

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_histogram.parallel.cpp.in000066400000000000000000000026201502235215600261220ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// Source template for one translation unit of the parallel histogram tuning
// build. The @DataType@ and @BlockSize@ placeholders are replaced before
// compilation (NOTE(review): presumably by CMake configure_file - confirm), so
// each generated file instantiates the generator for one type/block-size pair.
//
// NOTE(review): the target of the first #include below is missing in this copy
// of the file (angle-bracket text was lost); restore it from upstream.
#include
#include "benchmark_utils.hpp"
#include "benchmark_device_histogram.parallel.hpp"

// Anonymous namespace gives `benchmarks` internal linkage. Its initializer runs
// during static initialization and passes the generator's static create()
// function to create_bulk (NOTE(review): presumably this registers the generated
// benchmarks with config_autotune_register - confirm).
namespace
{
auto benchmarks = config_autotune_register::create_bulk(
    device_histogram_benchmark_generator<@DataType@, @BlockSize@>::create);
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_histogram.parallel.hpp000066400000000000000000000364301502235215600255300ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#ifndef ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include template std::vector generate(size_t size, int entropy_reduction, int lower_level, int upper_level) { if(entropy_reduction >= 5) { return std::vector(size, static_cast((lower_level + upper_level) / 2)); } const size_t max_random_size = 1024 * 1024 + 4321; const unsigned int seed = 123; engine_type gen(seed); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { // Reduce enthropy by applying bitwise AND to random bits // "An Improved Supercomputer Sorting Benchmark", 1992 // Kurt Thearling & Stephen Smith auto v = gen(); for(int e = 0; e < entropy_reduction; e++) { v &= gen(); } return T(lower_level + v % (upper_level - lower_level)); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } // Cache for input data when multiple cases must be benchmarked with various configurations and // same inputs can be used for consecutive benchmarks. // It must be used as a singleton. class input_cache { public: ~input_cache() { clear(); } void clear() { for(auto& i : cache) { HIP_CHECK(hipFree(i.second)); } cache.clear(); } // The function returns an exisitng buffer if main_key matches and there is additional_key // in the cache or generates a new buffer using gen(). // If main_key does not match, it frees all device buffers and resets the cache. 
template T* get_or_generate(const std::string& main_key, const std::string& additional_key, size_t size, F gen) { if(this->main_key != main_key) { // The main key (for example, data type) has been changed, clear the cache clear(); this->main_key = main_key; } auto result = cache.find(additional_key); if(result != cache.end()) { return reinterpret_cast(result->second); } // Generate a new buffer std::vector data = gen(); T* d_buffer; HIP_CHECK(hipMalloc(&d_buffer, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_buffer, data.data(), size * sizeof(T), hipMemcpyHostToDevice)); cache[additional_key] = d_buffer; return d_buffer; } static input_cache& instance() { static input_cache instance; return instance; } private: std::string main_key; std::map cache; }; template std::string config_name() { const rocprim::detail::histogram_config_params config = Config(); return "{bs:" + std::to_string(config.histogram_config.block_size) + ",ipt:" + std::to_string(config.histogram_config.items_per_thread) + ",max_grid_size:" + std::to_string(config.max_grid_size) + ",shared_impl_max_bins:" + std::to_string(config.shared_impl_max_bins) + ",shared_impl_histograms:" + std::to_string(config.shared_impl_histograms) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_histogram_benchmark : public config_autotune_interface { std::vector cases; device_histogram_benchmark(const std::vector& cases) : cases(cases) {} std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:histogram,value_type:" + std::string(Traits::name()) + ",channels:" + std::to_string(Channels) + ",active_channels:" + std::to_string(ActiveChannels) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 3; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t full_size, const managed_seed&, hipStream_t stream) const override { using counter_type = 
unsigned int; using level_type = typename std:: conditional_t::value && sizeof(T) < sizeof(int), int, T>; struct case_data { level_type lower_level[ActiveChannels]; level_type upper_level[ActiveChannels]; unsigned int num_levels[ActiveChannels]; T* d_input; }; const std::size_t size = full_size / Channels; size_t temporary_storage_bytes = 0; void* d_temporary_storage = nullptr; counter_type* d_histogram[ActiveChannels]; unsigned int max_bins = 0; std::vector cases_data; for(auto& bins : cases) { for(int entropy_reduction : {0, 2, 4, 6}) { case_data data; // Reuse inputs for the same sample type. This autotune uses multipe inputs for all // combinations of bins and entropy, but the inputs do not depend on autotuned // params (bs, ipt, shared_impl_max_bins) and can be reused saving time needed for // generating and copying to device. data.d_input = input_cache::instance().get_or_generate( std::string(Traits::name()), std::to_string(bins) + "_" + std::to_string(entropy_reduction), full_size, [&]() { return generate(full_size, entropy_reduction, 0, bins); }); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { data.lower_level[channel] = 0; data.upper_level[channel] = bins; data.num_levels[channel] = bins + 1; } cases_data.push_back(data); size_t current_temporary_storage_bytes = 0; HIP_CHECK((rocprim::multi_histogram_even( d_temporary_storage, current_temporary_storage_bytes, data.d_input, size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); temporary_storage_bytes = std::max(temporary_storage_bytes, current_temporary_storage_bytes); max_bins = std::max(max_bins, bins); } } HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], max_bins * sizeof(counter_type))); } HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { for(auto& data : cases_data) { 
HIP_CHECK((rocprim::multi_histogram_even( d_temporary_storage, temporary_storage_bytes, data.d_input, size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); } } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(auto& data : cases_data) { for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((rocprim::multi_histogram_even( d_temporary_storage, temporary_storage_bytes, data.d_input, size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); } } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * cases_data.size() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * cases_data.size() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_histogram[channel])); } } }; template struct device_histogram_benchmark_generator { static constexpr unsigned int min_items_per_thread = 1; static constexpr unsigned int max_items_per_thread = 16; static constexpr unsigned int min_shared_impl_histograms = 2; static constexpr unsigned int max_shared_impl_histograms = 4; template struct create_ipt { template struct create_shared_impl_histograms { using generated_config = rocprim::histogram_config, 2048, 2048, SharedImplHistograms>; template auto create(std::vector>& storage, const std::vector& cases) -> typename std::enable_if<(items_per_thread * 
Channels <= max_items_per_thread), void>::type { storage.emplace_back( std::make_unique< device_histogram_benchmark>( cases)); } template auto create(std::vector>& /*storage*/, const std::vector& /*cases*/) -> typename std::enable_if::type {} void operator()(std::vector>& storage, const std::vector& cases) { // Tune histograms for single-channel data (histogram_even) create<1, 1>(storage, cases); // and some multi-channel configurations (multi_histogram_even) create<2, 2>(storage, cases); create<3, 3>(storage, cases); create<4, 4>(storage, cases); create<4, 3>(storage, cases); } }; void operator()(std::vector>& storage, const std::vector& cases) { static_for_each, create_shared_impl_histograms>(storage, cases); } }; static void create(std::vector>& storage) { // Benchmark multiple cases (with various sample distributions) and use sum of all cases // as a measurement for autotuning std::vector cases; if(std::is_same::value) { cases = {16, 127}; } else { cases = { 10, 100, 1000, 10000 // Multiple bins to trigger a global memory implementation }; } static_for_each, create_ipt>(storage, cases); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_memory.cpp000066400000000000000000001555671502235215600232600ustar00rootroot00000000000000// MIT License // // Copyright (c) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // rocPRIM #include #include #include #include #include #include enum memory_operation_method { block_primitives_transpose, striped, vectorized, block_primitive_direct, }; enum kernel_operation { no_operation, block_scan, custom_operation, atomics_no_collision, atomics_inter_block_collision, atomics_inter_warp_collision, }; template< kernel_operation Operation, class T, unsigned int ItemsPerThread, unsigned int BlockSize = 0 > struct operation; // no operation template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&)[ItemsPerThread], void* = nullptr, unsigned int = 0, T* = nullptr) const { // No operation } }; #define repeats 30 // custom operation template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) const { (void) shared_storage; (void) shared_storage_size; (void) global_mem_output; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; ROCPRIM_UNROLL for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); } } } }; // block scan template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) const { (void) 
global_mem_output; using block_scan_type = typename rocprim::block_scan< T, BlockSize, rocprim::block_scan_algorithm::using_warp_scan>; block_scan_type bscan; // when using vectorized or striped functions // NOTE: This is not safe but it is the easiest way to prevent code repetition if(shared_storage == nullptr || shared_storage_size < sizeof(typename block_scan_type::storage_type)) { __shared__ typename block_scan_type::storage_type storage; shared_storage = &storage; } bscan.inclusive_scan( input, input, *(reinterpret_cast(shared_storage)) ); __syncthreads(); } }; // atomics_no_collision template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void) shared_storage; (void) shared_storage_size; (void) input; unsigned int index = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_block_collision template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void) shared_storage; (void) shared_storage_size; (void) input; unsigned int index = (threadIdx.x % rocprim::arch::wavefront::min_size()) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_block_collision template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void) shared_storage; (void) shared_storage_size; (void) input; unsigned int index = threadIdx.x * ItemsPerThread; 
ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // block_primitive_direct method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, class CustomOp = typename operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; using block_load_type = typename rocprim::block_load< T, BlockSize, ItemsPerThread, rocprim::block_load_method::block_load_direct>; using block_store_type = typename rocprim::block_store< T, BlockSize, ItemsPerThread, rocprim::block_store_method::block_store_direct>; block_load_type load; block_store_type store; __shared__ union { typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); __syncthreads(); op(items, &storage, sizeof(storage), output); store.store(output + offset, items, storage.store); } // vectorized method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, class CustomOp = typename operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; rocprim::block_load_direct_blocked_vectorized (threadIdx.x, input + offset, items); __syncthreads(); op(items, nullptr, 0, output); rocprim::block_store_direct_blocked_vectorized (threadIdx.x, output + offset, items); } // striped method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method 
MemOp, class CustomOp = typename operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T items[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, items); op(items, nullptr, 0, output); rocprim::block_store_direct_striped(lid, output + block_offset, items); } // block_primitives_transpose method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, class CustomOp = typename operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; using block_load_type = typename rocprim::block_load< T, BlockSize, ItemsPerThread, rocprim::block_load_method::block_load_transpose>; using block_store_type = typename rocprim::block_store< T, BlockSize, ItemsPerThread, rocprim::block_store_method::block_store_transpose>; block_load_type load; block_store_type store; __shared__ union { typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); __syncthreads(); op(items, &storage, sizeof(storage), output); store.store(output + offset, items, storage.store); } template void run_benchmark(benchmark::State& state, size_t size, const managed_seed& seed, const hipStream_t stream) { const size_t grid_size = size / (BlockSize * ItemsPerThread); std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); 
HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); operation selected_operation; // Warm-up for(size_t i = 0; i < 10; i++) { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template void run_benchmark_memcpy(benchmark::State& state, size_t size, const managed_seed&, const hipStream_t stream) { // Allocate device buffers // Note: since this benchmark only tests performance by memcpying between device buffers, // we don't really need to transfer data into these from the host - whatever happens // to be in device memory will do. 
T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:memory,subalgo:" #METHOD \ ",operation:" #OPERATION ",key_type:" #T ",size:" #SIZE \ ",cfg:{bs:" #BLOCK_SIZE ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ SIZE, \ seed, \ stream) #define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:memory,subalgo:copy,key_type:" #T \ ",size:" #SIZE ",cfg:default_config}") \ .c_str(), \ run_benchmark_memcpy, \ SIZE, \ seed, \ stream) template constexpr unsigned int megabytes(unsigned int size) { return(size * (1024 * 1024 / sizeof(T))); } int main(int argc, char *argv[]) { cli::Parser 
parser(argc, argv); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = { // simple memory copy not running kernel CREATE_BENCHMARK_MEMCPY(int, megabytes(128)), // simple memory copy CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, 
no_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 
1024, 4), // simple memory copy using vector type CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 4), 
CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 8), // simple memory copy using striped CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(striped, no_operation, int, 
megabytes(128), 1024, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 8), // block_scan CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 32), 
CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, 
float, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 
megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 4), // vectorized - block_scan CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 
256, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, 
block_scan, float, megabytes(128), 1024, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 8), 
CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 1024, 4), // custom_op CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 
megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, 
double, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 1), 
CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 1024, 2), // block_primitives_transpose - atomics no collision CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 
256, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 8), // block_primitives_transpose - atomics inter block collision CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 
megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 8), // block_primitives_transpose - atomics inter warp collision CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 1), 
CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 8), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge.cpp000066400000000000000000000147511502235215600230340ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK(...) \ { \ const device_merge_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_MERGE_KEYS_BENCHMARK(Key) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:merge,key_type:" #Key ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { run_merge_keys_benchmark(state, bytes, seed, stream); }) #define CREATE_MERGE_PAIRS_BENCHMARK(Key, Value) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:merge,key_type:" #Key ",value_type:" #Value \ ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { run_merge_pairs_benchmark(state, bytes, seed, stream); }) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple 
program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else // BENCHMARK_CONFIG_TUNING using custom_int2 = custom_type; using custom_double2 = custom_type; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(int, int) CREATE_BENCHMARK(long long, long long) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(custom_int2, custom_int2) CREATE_BENCHMARK(custom_double2, custom_double2) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } 
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge.parallel.cpp.in000066400000000000000000000026151502235215600252300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_merge.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_merge_benchmark_generator<@KeyType@, @ValueType@, @BlockSize@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge.parallel.hpp000066400000000000000000000427051502235215600246340ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_MERGE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_MERGE_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include #include // rocPRIM HIP API #include #include #include #include #include #include #include #include namespace rp = rocprim; template std::string config_name() { const rocprim::detail::merge_config_params params = Config(); return "{bs:" + std::to_string(params.kernel_config.block_size) + ",ipt:" + std::to_string(params.kernel_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_merge_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:merge,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = KeyType; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; size_t size = bytes / sizeof(key_type); const size_t size1 = size / 2; const size_t size2 = size - size1; compare_op_type compare_op; // Generate data const auto random_range = limit_random_range(0, size); std::vector keys_input1 = get_random_data(size1, random_range.first, random_range.second, seed.get_0()); std::vector keys_input2 = get_random_data(size2, random_range.first, random_range.second, seed.get_1()); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); key_type* d_keys_input1; key_type* d_keys_input2; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * 
sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input1, keys_input1.data(), size1 * sizeof(key_type), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_keys_input2, keys_input2.data(), size2 * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::merge(d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, size1, size2, compare_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::merge(d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, size1, size2, compare_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::merge(d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, size1, size2, compare_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input1)); HIP_CHECK(hipFree(d_keys_input2)); HIP_CHECK(hipFree(d_keys_output)); } // pairs 
benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = KeyType; using value_type = ValueType; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; size_t size = bytes / sizeof(key_type); const size_t size1 = size / 2; const size_t size2 = size - size1; compare_op_type compare_op; // Generate data const auto random_range = limit_random_range(0, size); std::vector keys_input1 = get_random_data(size1, random_range.first, random_range.second, seed.get_0()); std::vector keys_input2 = get_random_data(size2, random_range.first, random_range.second, seed.get_1()); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); std::vector values_input1(size1); std::vector values_input2(size2); std::iota(values_input1.begin(), values_input1.end(), 0); std::iota(values_input2.begin(), values_input2.end(), size1); key_type* d_keys_input1; key_type* d_keys_input2; key_type* d_keys_output; value_type* d_values_input1; value_type* d_values_input2; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMalloc(reinterpret_cast(&d_values_input1), size1 * sizeof(value_type))); HIP_CHECK( hipMalloc(reinterpret_cast(&d_values_input2), size2 * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_keys_input1, keys_input1.data(), size1 * sizeof(key_type), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_keys_input2, keys_input2.data(), size2 * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; 
HIP_CHECK(rp::merge(d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, d_values_input1, d_values_input2, d_values_output, size1, size2, compare_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::merge(d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, d_values_input1, d_values_input2, d_values_output, size1, size2, compare_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::merge(d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, d_values_input1, d_values_input2, d_values_output, size1, size2, compare_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input1)); HIP_CHECK(hipFree(d_keys_input2)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input1)); HIP_CHECK(hipFree(d_values_input2)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { do_run(state, bytes, seed, stream); } }; 
#ifdef BENCHMARK_CONFIG_TUNING template struct device_merge_benchmark_generator { template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::merge_config; using benchmark_struct = device_merge_benchmark; void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique()); } }; struct create_default_config { using default_config = typename rocprim::detail::default_merge_config_base::type; using benchmark_struct = device_merge_benchmark; void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique()); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread_exponent = 0u; // Very large block sizes don't work with large items_per_thread since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = sizeof(KeyType) + sizeof(ValueType); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; create_default_config()(storage); static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_MERGE_PARALLEL_HPP_rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort.cpp000066400000000000000000000105401502235215600240730ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge_sort.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_merge_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; // used by ssbk benchmark using custom_longlong_double = custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); 
b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort.hpp000066400000000000000000000271671502235215600241150ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include namespace rp = rocprim; template struct device_merge_sort_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:merge_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; 
HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * 
sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { 
do_run(state, bytes, seed, stream); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort_block_merge.cpp000066400000000000000000000130171502235215600264260ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge_sort_block_merge.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_merge_sort_block_merge_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using 
custom_longlong_double = custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort_block_merge.parallel.cpp.in000066400000000000000000000027071502235215600306320ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_merge_sort_block_merge.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_merge_sort_block_merge_benchmark_generator<@BlockSize@, @UseMergePath@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort_block_merge.parallel.hpp000066400000000000000000000466471502235215600302450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include namespace rp = rocprim; template std::string config_name() { const rocprim::detail::merge_sort_block_merge_config_params config = Config(); return "{oddeven_bs:" + std::to_string(config.merge_oddeven_config.block_size) + ",oddeven_ipt:" + std::to_string(config.merge_oddeven_config.items_per_thread) + ",oddeven_size_limit:" + std::to_string(config.merge_oddeven_config.size_limit) + ",mergepath_partition_bs:" + std::to_string(config.merge_mergepath_partition_config.block_size) + ",mergepath_bs:" + std::to_string(config.merge_mergepath_config.block_size) + ",mergepath_ipt:" + std::to_string(config.merge_mergepath_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_merge_sort_block_merge_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:merge_sort_block_merge,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // Because merge_sort_block_merge expects partially sorted input: using block_sort_config = rocprim::default_config; // keys benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; 
key_type* d_keys; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); ::rocprim::less lesser_op; rocprim::empty_type* values_ptr = nullptr; // Merge_sort_block_merge algorithm expects partially sorted input: unsigned int sorted_block_size; HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_input, values_ptr, values_ptr, size, sorted_block_size, lesser_op, stream, false)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, values_ptr, size, sorted_block_size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); hipError_t err; // Warm-up for(size_t i = 0; i < warmup_size; i++) { err = rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, values_ptr, size, sorted_block_size, lesser_op, stream, false); } if(err == hipError_t::hipErrorAssert) { state.SkipWithError("SKIPPING: block_sort_items_per_block >= " "block_merge_items_per_block does not hold"); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); return; } else if(err != hipSuccess) { std::cout << "HIP error: " << err << " line: " << __LINE__ << std::endl; exit(err); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipMemcpyAsync(d_keys, d_keys_input, size * sizeof(key_type), hipMemcpyDeviceToDevice, stream)); HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, 
temporary_storage_bytes, d_keys, values_ptr, size, sorted_block_size, lesser_op, stream, false)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type* d_keys_input; key_type* d_keys; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); ::rocprim::less lesser_op; // Merge_sort_block_merge algorithm expects partially sorted input: unsigned int 
sorted_block_size; HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_input, d_values_input, d_values_input, size, sorted_block_size, lesser_op, stream, false)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, d_values, size, sorted_block_size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); hipError_t err; // Warm-up for(size_t i = 0; i < warmup_size; i++) { err = rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, d_values, size, sorted_block_size, lesser_op, stream, false); } if(err == hipError_t::hipErrorAssert) { state.SkipWithError("SKIPPING: block_sort_items_per_block >= " "block_merge_items_per_block does not hold"); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values)); return; } else if(err != hipSuccess) { std::cout << "HIP error: " << err << " line: " << __LINE__ << std::endl; exit(err); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipMemcpyAsync(d_keys, d_keys_input, size * sizeof(key_type), hipMemcpyDeviceToDevice, stream)); HIP_CHECK(hipMemcpyAsync(d_values, d_values_input, size * sizeof(key_type), hipMemcpyDeviceToDevice, stream)); HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, d_values, size, sorted_block_size, lesser_op, stream, false)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; 
HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values)); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { do_run(state, bytes, seed, stream); } }; template struct device_merge_sort_block_merge_benchmark_generator { static constexpr unsigned int get_limit() { return use_mergepath ? 0 : UINT32_MAX; } template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::detail::merge_sort_block_merge_config; using benchmark_struct = device_merge_sort_block_merge_benchmark; void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique()); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread_exponent = 0u; // Very large block sizes don't work with large items_per_thread since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = sizeof(Key) + sizeof(Value); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ 
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort_block_sort.cpp000066400000000000000000000131641502235215600263210ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_merge_sort_block_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using 
custom_longlong_double = custom_type; using custom_char_short = custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) CREATE_BENCHMARK(int, custom_char_short) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort_block_sort.parallel.cpp.in000066400000000000000000000027101502235215600305140ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_merge_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_merge_sort_block_sort_benchmark_generator<@BlockSize@, @BlockSortMethod@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp000066400000000000000000000341431502235215600301210ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include namespace rp = rocprim; constexpr const char* get_block_sort_method_name(rocprim::block_sort_algorithm alg) { switch(alg) { case rocprim::block_sort_algorithm::merge_sort: return "merge_sort"; case rocprim::block_sort_algorithm::bitonic_sort: return "bitonic_sort"; case rocprim::block_sort_algorithm::stable_merge_sort: return "stable_merge_sort"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } template std::string config_name() { const rocprim::detail::merge_sort_block_sort_config_params config = Config(); return "{bs:" + std::to_string(config.block_sort_config.block_size) + ",ipt:" + std::to_string(config.block_sort_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_merge_sort_block_sort_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:merge_sort_block_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; 
HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; rocprim::empty_type* values_ptr = nullptr; unsigned int items_per_block; // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, size, items_per_block, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, size, items_per_block, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), 
seed.get_0()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; unsigned int items_per_block; HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() 
* batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { do_run(state, bytes, seed, stream); } }; template struct device_merge_sort_block_sort_benchmark_generator { template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::detail::merge_sort_block_sort_config; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_merge_sort_block_sort_benchmark>()); } }; static void create(std::vector>& storage) { // Sort_items_per_block must be equal or larger than merge_items_per_block, so make // the items_per_thread at least as large so the sort_items_per_block // would be atleast 1024. static constexpr unsigned int min_items_per_thread_exponent = rocprim::Log2<(1024 / BlockSize)>::VALUE; // Very large block sizes don't work with large items_per_blocks since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = std::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_nth_element.cpp000066400000000000000000000103351502235215600242310ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// NOTE(review): the reviewed copy of this file had lost the contents of most
// angle brackets. Template argument lists and <...> include targets below were
// restored from context - confirm them against upstream.

#include "benchmark_device_nth_element.hpp"
#include "benchmark_utils.hpp"
// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include <benchmark/benchmark.h>

// HIP API
#include <hip/hip_runtime.h>

#include <cstddef>
#include <string>
#include <vector>

#ifndef DEFAULT_BYTES
const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
#endif

// Registers one nth_element benchmark for TYPE; SMALL_N is forwarded to the
// benchmark constructor. Expects `benchmarks`, `bytes`, `seed` and `stream`
// to be in scope.
#define CREATE_BENCHMARK_NTH_ELEMENT(TYPE, SMALL_N)                    \
    {                                                                  \
        const device_nth_element_benchmark<TYPE> instance(SMALL_N);    \
        REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \
    }

// Registers both the small-n and the large-n variant for TYPE.
#define CREATE_BENCHMARK(TYPE)                    \
    {                                             \
        CREATE_BENCHMARK_NTH_ELEMENT(TYPE, true)  \
        CREATE_BENCHMARK_NTH_ELEMENT(TYPE, false) \
    }

/// Benchmark driver for rocprim::nth_element.
int main(int argc, char* argv[])
{
    // Command line options
    cli::Parser parser(argc, argv);
    parser.set_optional<size_t>("size", "size", DEFAULT_BYTES, "number of bytes");
    parser.set_optional<int>("trials", "trials", -1, "number of iterations");
    parser.set_optional<std::string>("name_format",
                                     "name_format",
                                     "human",
                                     "either: json,human,txt");
    parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
    parser.run_and_exit_if_error();

    // Let Google Benchmark consume its own arguments, then read ours
    benchmark::Initialize(&argc, argv);
    const size_t bytes  = parser.get<size_t>("size");
    const int    trials = parser.get<int>("trials");
    bench_naming::set_format(parser.get<std::string>("name_format"));
    const std::string  seed_type = parser.get<std::string>("seed");
    const managed_seed seed(seed_type);

    // Default HIP stream
    hipStream_t stream = 0;

    // Context recorded alongside the results
    add_common_benchmark_info();
    benchmark::AddCustomContext("bytes", std::to_string(bytes));
    benchmark::AddCustomContext("seed", seed_type);

    // Register benchmarks: built-in key types first ...
    std::vector<benchmark::internal::Benchmark*> benchmarks{};
    CREATE_BENCHMARK(int)
    CREATE_BENCHMARK(long long)
    CREATE_BENCHMARK(int8_t)
    CREATE_BENCHMARK(uint8_t)
    CREATE_BENCHMARK(rocprim::half)
    CREATE_BENCHMARK(short)
    CREATE_BENCHMARK(float)

    // ... then composite key types
    using custom_float2          = custom_type<float, float>;
    using custom_double2         = custom_type<double, double>;
    using custom_int2            = custom_type<int, int>;
    using custom_char_double     = custom_type<char, double>;
    using custom_longlong_double = custom_type<long long, double>;

    CREATE_BENCHMARK(custom_float2)
    CREATE_BENCHMARK(custom_double2)
    CREATE_BENCHMARK(custom_int2)
    CREATE_BENCHMARK(custom_char_double)
    CREATE_BENCHMARK(custom_longlong_double)

    // Manual timing in milliseconds; optionally pin the iteration count
    const bool force_iterations = trials > 0;
    for(auto& registered : benchmarks)
    {
        registered->UseManualTime();
        registered->Unit(benchmark::kMillisecond);
        if(force_iterations)
        {
            registered->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_nth_element.hpp000066400000000000000000000143631502235215600242430ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#ifndef ROCPRIM_BENCHMARK_DEVICE_NTH_ELEMENT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_NTH_ELEMENT_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include template struct device_nth_element_benchmark : public config_autotune_interface { bool small_n = false; device_nth_element_benchmark(bool SmallN) { small_n = SmallN; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:nth_element,nth:" + (small_n ? "small"s : "large"s) + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t nth = 10; if(!small_n) { nth = size / 2; } // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(*d_keys_input))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(*d_keys_output))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(*d_keys_input), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::nth_element(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, nth, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rocprim::nth_element(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, nth, size, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP 
events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::nth_element(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, nth, size, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(*d_keys_input)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_NTH_ELEMENT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_partial_sort.cpp000066400000000000000000000103421502235215600244300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_partial_sort.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK_PARTIAL_SORT(TYPE, SMALL_N) \ { \ const device_partial_sort_benchmark instance(SMALL_N); \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_BENCHMARK(TYPE) \ { \ CREATE_BENCHMARK_PARTIAL_SORT(TYPE, true) \ CREATE_BENCHMARK_PARTIAL_SORT(TYPE, false) \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks{}; CREATE_BENCHMARK(int) 
CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(float) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_char_double) CREATE_BENCHMARK(custom_longlong_double) // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_partial_sort.hpp000066400000000000000000000153641502235215600244460ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include template struct device_partial_sort_benchmark : public config_autotune_interface { bool small_n = false; device_partial_sort_benchmark(bool SmallN) { small_n = SmallN; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partial_sort,nth:" + (small_n ? "small"s : "half"s) + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t middle = 10; if(!small_n) { middle = size / 2; } // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_new_data; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(*d_keys_input))); HIP_CHECK(hipMalloc(&d_keys_new_data, size * sizeof(*d_keys_new_data))); HIP_CHECK(hipMemcpy(d_keys_new_data, keys_input.data(), size * sizeof(*d_keys_input), hipMemcpyHostToDevice)); rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::partial_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, middle, size, lesser_op, stream, false)); 
HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipMemcpy(d_keys_input, d_keys_new_data, size * sizeof(*d_keys_input), hipMemcpyDeviceToDevice)); HIP_CHECK(rocprim::partial_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, middle, size, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { float elapsed_mseconds = 0; for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipMemcpy(d_keys_input, d_keys_new_data, size * sizeof(*d_keys_input), hipMemcpyDeviceToDevice)); // Record start event HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(rocprim::partial_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, middle, size, lesser_op, stream, false)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds_current; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds_current, start, stop)); elapsed_mseconds += elapsed_mseconds_current; } state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(*d_keys_input)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_new_data)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_partial_sort_copy.cpp000066400000000000000000000104131502235215600254610ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_partial_sort_copy.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK_PARTIAL_SORT_COPY(TYPE, SMALL_N) \ { \ const device_partial_sort_copy_benchmark instance(SMALL_N); \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_BENCHMARK(TYPE) \ { \ CREATE_BENCHMARK_PARTIAL_SORT_COPY(TYPE, true) \ CREATE_BENCHMARK_PARTIAL_SORT_COPY(TYPE, false) \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks{}; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(float) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(custom_int2) 
CREATE_BENCHMARK(custom_char_double) CREATE_BENCHMARK(custom_longlong_double) // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_partial_sort_copy.hpp000066400000000000000000000147471502235215600255040ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_COPY_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_COPY_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include template struct device_partial_sort_copy_benchmark : public config_autotune_interface { bool small_n = false; device_partial_sort_copy_benchmark(bool SmallN) { small_n = SmallN; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partial_sort_copy,nth:" + (small_n ? "small"s : "half"s) + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t middle = 10; if(!small_n) { middle = size / 2; } // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(*d_keys_input))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(*d_keys_output))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(*d_keys_input), hipMemcpyHostToDevice)); rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::partial_sort_copy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, middle, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rocprim::partial_sort_copy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, middle, size, lesser_op, stream, 
false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::partial_sort_copy(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, middle, size, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(*d_keys_input)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_COPY_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_partition.cpp000066400000000000000000000241261502235215600237430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_partition.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_PARTITION_FLAG_BENCHMARK(T, F, p) \ { \ const device_partition_flag_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_PARTITION_PREDICATE_BENCHMARK(T, p) \ { \ const device_partition_predicate_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(T, F, p) \ { \ const device_partition_two_way_flag_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(T, p) \ { \ const device_partition_two_way_predicate_benchmark \ instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_PARTITION_THREE_WAY_BENCHMARK(T, p) \ { \ const device_partition_three_way_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define BENCHMARK_FLAG_TYPE(type, flag_type) \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, partition_probability::p005); \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, partition_probability::p025); \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, 
partition_probability::p050); \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, partition_probability::p075) #define BENCHMARK_PREDICATE_TYPE(type) \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p005); \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p025); \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p050); \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p075) #define BENCHMARK_TWO_WAY_FLAG_TYPE(type, flag_type) \ CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p005); \ CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p025); \ CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p050); \ CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p075) #define BENCHMARK_TWO_WAY_PREDICATE_TYPE(type) \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p005); \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p025); \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p050); \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p075) #define BENCHMARK_THREE_WAY_TYPE(type) \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p005_p025); \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p025_p050); \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p050_p075); \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p075_p100) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef 
BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else using custom_double2 = custom_type; using custom_int_double = custom_type; BENCHMARK_FLAG_TYPE(int, unsigned char); BENCHMARK_FLAG_TYPE(float, unsigned char); BENCHMARK_FLAG_TYPE(double, unsigned char); BENCHMARK_FLAG_TYPE(uint8_t, uint8_t); BENCHMARK_FLAG_TYPE(int8_t, int8_t); BENCHMARK_FLAG_TYPE(rocprim::half, int8_t); BENCHMARK_FLAG_TYPE(custom_double2, unsigned char); BENCHMARK_PREDICATE_TYPE(int); BENCHMARK_PREDICATE_TYPE(float); BENCHMARK_PREDICATE_TYPE(double); BENCHMARK_PREDICATE_TYPE(uint8_t); BENCHMARK_PREDICATE_TYPE(int8_t); BENCHMARK_PREDICATE_TYPE(rocprim::half); BENCHMARK_PREDICATE_TYPE(custom_int_double); BENCHMARK_TWO_WAY_FLAG_TYPE(int, unsigned char); BENCHMARK_TWO_WAY_FLAG_TYPE(float, unsigned char); BENCHMARK_TWO_WAY_FLAG_TYPE(double, unsigned char); BENCHMARK_TWO_WAY_FLAG_TYPE(uint8_t, uint8_t); 
BENCHMARK_TWO_WAY_FLAG_TYPE(int8_t, int8_t); BENCHMARK_TWO_WAY_FLAG_TYPE(rocprim::half, int8_t); BENCHMARK_TWO_WAY_FLAG_TYPE(custom_double2, unsigned char); BENCHMARK_TWO_WAY_PREDICATE_TYPE(int); BENCHMARK_TWO_WAY_PREDICATE_TYPE(float); BENCHMARK_TWO_WAY_PREDICATE_TYPE(double); BENCHMARK_TWO_WAY_PREDICATE_TYPE(uint8_t); BENCHMARK_TWO_WAY_PREDICATE_TYPE(int8_t); BENCHMARK_TWO_WAY_PREDICATE_TYPE(rocprim::half); BENCHMARK_TWO_WAY_PREDICATE_TYPE(custom_int_double); BENCHMARK_THREE_WAY_TYPE(int); BENCHMARK_THREE_WAY_TYPE(float); BENCHMARK_THREE_WAY_TYPE(double); BENCHMARK_THREE_WAY_TYPE(uint8_t); BENCHMARK_THREE_WAY_TYPE(int8_t); BENCHMARK_THREE_WAY_TYPE(rocprim::half); BENCHMARK_THREE_WAY_TYPE(custom_int_double); #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_partition.parallel.cpp.in000066400000000000000000000026251502235215600261430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_partition.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_partition_benchmark_generator<@DataType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_partition.parallel.hpp000066400000000000000000001010251502235215600255350ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_PARTITION_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_PARTITION_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "cmdparser.hpp" #include #include #include #include #include #include #include #include #include #include enum class partition_probability { p005, p025, p050, p075, tuning }; inline float get_probability(partition_probability probability) { switch(probability) { case partition_probability::p005: return 0.05f; case partition_probability::p025: return 0.25f; case partition_probability::p050: return 0.50f; case partition_probability::p075: return 0.75f; case partition_probability::tuning: return 0.0f; // not used } return 0.0f; } inline const char* get_probability_name(partition_probability probability) { switch(probability) { case partition_probability::p005: return "0.05"; case partition_probability::p025: return "0.25"; case partition_probability::p050: return "0.50"; case partition_probability::p075: return "0.75"; case partition_probability::tuning: return "tuning"; } return "invalid"; } enum class partition_three_way_probability { p005_p025, p025_p050, p050_p075, p075_p100, tuning }; inline std::pair get_probability(partition_three_way_probability probability) { switch(probability) { case partition_three_way_probability::p005_p025: return std::make_pair(0.05f, 0.25f); case partition_three_way_probability::p025_p050: return std::make_pair(0.25f, 0.50f); case partition_three_way_probability::p050_p075: return std::make_pair(0.50f, 0.75f); case partition_three_way_probability::p075_p100: return std::make_pair(0.75f, 1.00f); case partition_three_way_probability::tuning: return std::make_pair(0.00f, 0.00f); // not used } return std::make_pair(0.00f, 
0.00f); } inline const char* get_probability_name(partition_three_way_probability probability) { switch(probability) { case partition_three_way_probability::p005_p025: return "0.05:0.25"; case partition_three_way_probability::p025_p050: return "0.25:0.50"; case partition_three_way_probability::p050_p075: return "0.50:0.75"; case partition_three_way_probability::p075_p100: return "0.75:1.00"; case partition_three_way_probability::tuning: return "tuning"; } return "invalid"; } constexpr int warmup_iter = 5; constexpr int batch_size = 10; template struct device_partition_flag_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:partition,subalgo:flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, const hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } DataType* d_input{}; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); FlagType* d_flags_0{}; FlagType* d_flags_1{}; FlagType* d_flags_2{}; HIP_CHECK(hipMalloc(&d_flags_0, size * sizeof(*d_flags_0))); HIP_CHECK( hipMemcpy(d_flags_0, flags_0.data(), size * sizeof(*d_flags_0), hipMemcpyHostToDevice)); 
if(is_tuning) { HIP_CHECK(hipMalloc(&d_flags_1, size * sizeof(*d_flags_1))); HIP_CHECK(hipMemcpy(d_flags_1, flags_1.data(), size * sizeof(*d_flags_1), hipMemcpyHostToDevice)); HIP_CHECK(hipMalloc(&d_flags_2, size * sizeof(*d_flags_2))); HIP_CHECK(hipMemcpy(d_flags_2, flags_2.data(), size * sizeof(*d_flags_2), hipMemcpyHostToDevice)); } DataType* d_output{}; HIP_CHECK(hipMalloc(&d_output, size * sizeof(*d_output))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](FlagType* d_flags) { HIP_CHECK(rocprim::partition(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, size, stream)); }; dispatch_flags(d_flags_0); if(is_tuning) { dispatch_flags(d_flags_1); dispatch_flags(d_flags_2); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); if(is_tuning) { 
HIP_CHECK(hipFree(d_flags_2)); HIP_CHECK(hipFree(d_flags_1)); } HIP_CHECK(hipFree(d_flags_0)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_predicate_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:partition,subalgo:predicate,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, const hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), static_cast(126), seed.get_0()); DataType* d_input{}; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); DataType* d_output{}; HIP_CHECK(hipMalloc(&d_output, size * sizeof(*d_output))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicate = [&](float probability) { auto predicate = [probability](const DataType& value) -> bool { return value < static_cast(127 * probability); }; HIP_CHECK(rocprim::partition(d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, size, predicate, stream)); }; if(is_tuning) { dispatch_predicate(0.0f); dispatch_predicate(0.5f); dispatch_predicate(1.0f); } else { dispatch_predicate(get_probability(Probability)); } }; // Allocate temporary storage 
memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_two_way_flag_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partition_two_way,subalgo:flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, const hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector 
flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } DataType* d_input{}; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); FlagType* d_flags_0{}; FlagType* d_flags_1{}; FlagType* d_flags_2{}; HIP_CHECK(hipMalloc(&d_flags_0, size * sizeof(*d_flags_0))); HIP_CHECK( hipMemcpy(d_flags_0, flags_0.data(), size * sizeof(*d_flags_0), hipMemcpyHostToDevice)); if(is_tuning) { HIP_CHECK(hipMalloc(&d_flags_1, size * sizeof(*d_flags_1))); HIP_CHECK(hipMemcpy(d_flags_1, flags_1.data(), size * sizeof(*d_flags_1), hipMemcpyHostToDevice)); HIP_CHECK(hipMalloc(&d_flags_2, size * sizeof(*d_flags_2))); HIP_CHECK(hipMemcpy(d_flags_2, flags_2.data(), size * sizeof(*d_flags_2), hipMemcpyHostToDevice)); } DataType* d_output_selected{}; HIP_CHECK(hipMalloc(&d_output_selected, size * sizeof(*d_output_selected))); DataType* d_output_rejected{}; HIP_CHECK(hipMalloc(&d_output_rejected, size * sizeof(*d_output_rejected))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](FlagType* d_flags) { HIP_CHECK(rocprim::partition_two_way(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output_selected, d_output_rejected, d_selected_count_output, size, stream)); }; dispatch_flags(d_flags_0); if(is_tuning) { dispatch_flags(d_flags_1); dispatch_flags(d_flags_2); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage = nullptr; 
HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); if(is_tuning) { HIP_CHECK(hipFree(d_flags_2)); HIP_CHECK(hipFree(d_flags_1)); } HIP_CHECK(hipFree(d_flags_0)); HIP_CHECK(hipFree(d_output_selected)); HIP_CHECK(hipFree(d_output_rejected)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_two_way_predicate_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partition_two_way,subalgo:predicate,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, const hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), 
static_cast(126), seed.get_0()); DataType* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); DataType* d_output_selected; HIP_CHECK(hipMalloc(&d_output_selected, size * sizeof(*d_output_selected))); DataType* d_output_rejected; HIP_CHECK(hipMalloc(&d_output_rejected, size * sizeof(*d_output_selected))); unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicate = [&](float probability) { auto predicate = [probability](const DataType& value) -> bool { return value < static_cast(127 * probability); }; HIP_CHECK(rocprim::partition_two_way(d_temp_storage, temp_storage_size_bytes, d_input, d_output_selected, d_output_rejected, d_selected_count_output, size, predicate, stream)); }; if(is_tuning) { dispatch_predicate(0.0f); dispatch_predicate(0.5f); dispatch_predicate(1.0f); } else { dispatch_predicate(get_probability(Probability)); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); 
state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output_selected)); HIP_CHECK(hipFree(d_output_rejected)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_three_way_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:partition_three_way,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, const hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), static_cast(126), seed.get_0()); DataType* d_input{}; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); DataType* d_output_first{}; HIP_CHECK(hipMalloc(&d_output_first, size * sizeof(*d_output_first))); DataType* d_output_second{}; HIP_CHECK(hipMalloc(&d_output_second, size * sizeof(*d_output_second))); DataType* d_output_unselected{}; HIP_CHECK(hipMalloc(&d_output_unselected, size * sizeof(*d_output_unselected))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, 2 * sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicate = [&](std::pair probability) { const float probability_one = probability.first; auto predicate_one = 
[probability_one](const DataType& value) { return value < DataType(127 * probability_one); }; const float probability_two = probability.second; auto predicate_two = [probability_two](const DataType& value) { return value < DataType(127 * probability_two); }; HIP_CHECK(rocprim::partition_three_way(d_temp_storage, temp_storage_size_bytes, d_input, d_output_first, d_output_second, d_output_unselected, d_selected_count_output, size, predicate_one, predicate_two, stream)); }; if(is_tuning) { // clang-format off std::array, 7> probabilities = {{ {0.33f, 0.66f}, // 1st, 2nd, and 3rd bin {0.50f, 1.00f}, // 1st and 2nd bin {0.00f, 0.50f}, // 2nd and 3rd bin {0.50f, 0.50f}, // 1st and 3rd bin {1.00f, 1.00f}, // 1st bin {0.00f, 1.00f}, // 2nd bin {0.00f, 0.00f}}}; // 3rd bin // clang-format on for(const std::pair& probability : probabilities) { dispatch_predicate(probability); } } else { dispatch_predicate(get_probability(Probability)); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); 
HIP_CHECK(hipFree(d_output_first)); HIP_CHECK(hipFree(d_output_second)); HIP_CHECK(hipFree(d_output_unselected)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == partition_three_way_probability::tuning; }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_partition_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::select_config; storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); } }; static void create(std::vector>& storage) { static constexpr int max_items_per_thread = std::min(64 / sizeof(DataType), size_t{32}); static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_PARTITION_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort.cpp000066400000000000000000000062221502235215600241050ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_radix_sort.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; add_sort_keys_benchmarks(benchmarks, bytes, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks 
benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort.hpp000066400000000000000000000441741502235215600241220ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include namespace rp = rocprim; template struct device_radix_sort_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:radix_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg: default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> std::enable_if_t::value, void> { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, static_cast(nullptr), static_cast(nullptr), size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, static_cast(nullptr), static_cast(nullptr), size, stream)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; 
HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, static_cast(nullptr), static_cast(nullptr), size, stream)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> std::enable_if_t::value, void> { using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * 
sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, const managed_seed& seed, hipStream_t stream) const override { do_run(state, size, seed, stream); } private: template static auto 
invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && std::is_same::value, hipError_t> { (void)values_input; (void)values_output; return rp::radix_sort_keys(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, size, 0, sizeof(K) * 8, stream); } template static auto invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && std::is_same::value, hipError_t> { (void)values_input; (void)values_output; return rp::radix_sort_keys(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, size, custom_type_decomposer{}, stream); } template static auto invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && !std::is_same::value, hipError_t> { return rp::radix_sort_pairs(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, values_input, values_output, size, 0, sizeof(K) * 8, stream); } template static auto invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && !std::is_same::value, hipError_t> { return rp::radix_sort_pairs(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, values_input, values_output, size, custom_type_decomposer{}, stream); } }; #define CREATE_RADIX_SORT_BENCHMARK(...) 
\ { \ const device_radix_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } inline void add_sort_keys_benchmarks(std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { using custom_key = custom_type; CREATE_RADIX_SORT_BENCHMARK(int) CREATE_RADIX_SORT_BENCHMARK(float) CREATE_RADIX_SORT_BENCHMARK(long long) CREATE_RADIX_SORT_BENCHMARK(int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half) CREATE_RADIX_SORT_BENCHMARK(short) CREATE_RADIX_SORT_BENCHMARK(custom_key) } inline void add_sort_pairs_benchmarks(std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_key = custom_type; CREATE_RADIX_SORT_BENCHMARK(int, float) CREATE_RADIX_SORT_BENCHMARK(int, double) CREATE_RADIX_SORT_BENCHMARK(int, float2) CREATE_RADIX_SORT_BENCHMARK(int, custom_float2) CREATE_RADIX_SORT_BENCHMARK(int, double2) CREATE_RADIX_SORT_BENCHMARK(int, custom_double2) CREATE_RADIX_SORT_BENCHMARK(long long, float) CREATE_RADIX_SORT_BENCHMARK(long long, double) CREATE_RADIX_SORT_BENCHMARK(long long, float2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_float2) CREATE_RADIX_SORT_BENCHMARK(long long, double2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_double2) CREATE_RADIX_SORT_BENCHMARK(int8_t, int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t, uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half, rocprim::half) CREATE_RADIX_SORT_BENCHMARK(custom_key, double) } #endif // ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort_block_sort.cpp000066400000000000000000000123341502235215600263270ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include "benchmark_device_radix_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_radix_sort_block_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_char_double = custom_type; CREATE_BENCHMARK(int, float) 
CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(int, custom_char_double) CREATE_BENCHMARK(long long, custom_double2) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort_block_sort.parallel.cpp.in000066400000000000000000000026651502235215600305350ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include "benchmark_device_radix_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_radix_sort_block_sort_benchmark_generator<@BlockSize@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort_block_sort.parallel.hpp000066400000000000000000000336241502235215600301340ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include namespace rp = rocprim; template std::string config_name() { const rocprim::detail::kernel_config_params config = Config(); return "{bs:" + std::to_string(config.block_size) + ",ipt:" + std::to_string(config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_radix_sort_block_sort_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:radix_sort_block_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); rocprim::empty_type* values_ptr = nullptr; unsigned int items_per_block; // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, size, items_per_block, 
rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( (rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, size, items_per_block, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; 
value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); unsigned int items_per_block; HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( (rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, const managed_seed& seed, hipStream_t stream) const override { do_run(state, size, seed, stream); } }; template struct device_radix_sort_block_sort_benchmark_generator { 
template struct create_ipt { using generated_config = rocprim::kernel_config; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_radix_sort_block_sort_benchmark>()); } }; static void create(std::vector>& storage) { // Sort_items_per_block must be equal or larger than radix_items_per_block, so make // the items_per_thread at least as large so the sort_items_per_block // would be atleast 1024. static constexpr unsigned int min_items_per_thread = 1024 / BlockSize; // Very large block sizes don't work with large items_per_blocks since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX - 2000; static constexpr unsigned int max_size_per_element = std::max(sizeof(Key), sizeof(Value)); static constexpr unsigned int max_items_per_thread = std::min(32u, max_shared_memory / (BlockSize * max_size_per_element)); static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort_onesweep.cpp000066400000000000000000000105001502235215600260040ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include "benchmark_device_radix_sort_onesweep.parallel.hpp" #include "benchmark_utils.hpp" #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; 
#ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else // BENCHMARK_CONFIG_TUNING add_sort_keys_benchmarks(benchmarks, bytes, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, seed, stream); #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort_onesweep.parallel.cpp.in000066400000000000000000000027021502235215600302110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_radix_sort_onesweep.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_radix_sort_onesweep_benchmark_generator<@BlockSize@, @RadixBits@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_radix_sort_onesweep.parallel.hpp000066400000000000000000000570351502235215600276220ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include namespace rp = rocprim; constexpr const char* radix_rank_algorithm_name(rp::block_radix_rank_algorithm algorithm) { switch(algorithm) { case rp::block_radix_rank_algorithm::basic: return "block_radix_rank_algorithm::basic"; case rp::block_radix_rank_algorithm::basic_memoize: return "block_radix_rank_algorithm::basic_memoize"; case rp::block_radix_rank_algorithm::match: return "block_radix_rank_algorithm::match"; } return ""; // unknown algorithm } template std::string config_name() { constexpr rocprim::detail::radix_sort_onesweep_config_params params = Config(); return "{histogram:{bs:" + std::to_string(params.histogram.block_size) + ",ipt:" + std::to_string(params.histogram.items_per_thread) + "},sort:{" + "bs:" + std::to_string(params.sort.block_size) + ",ipt:" + std::to_string(params.sort.items_per_thread) + "},bits_per_place:" + std::to_string(params.radix_bits_per_place) + ",algorithm:" + radix_rank_algorithm_name(params.radix_rank_algorithm) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_radix_sort_onesweep_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:radix_sort_onesweep,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); 
std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; bool is_result_in_output = true; rocprim::empty_type* d_values_ptr = nullptr; HIP_CHECK((rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( (rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( (rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; 
HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; bool is_result_in_output = true; HIP_CHECK((rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_input, nullptr, d_values_output, size, is_result_in_output, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, 
false))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( (rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_input, nullptr, d_values_output, size, is_result_in_output, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( (rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_input, nullptr, d_values_output, size, is_result_in_output, rp::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, const managed_seed& seed, hipStream_t stream) const override { do_run(state, size, seed, stream); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_radix_sort_onesweep_benchmark_generator { template static constexpr bool is_buildable() 
{ // Calculation uses `rocprim::arch::wavefront::min_size()`, which is 64 on host side unless overridden. // However, this does not affect the total size of shared memory for the current configuration space. // Were the implementation to change, causing retuning, this needs to be re-evaluated and possibly taken into account. using sharedmem_storage = typename rp::detail::onesweep_iteration_helper::storage_type; return sizeof(sharedmem_storage) < TUNING_SHARED_MEMORY_MAX; } template struct create_ipt; template struct create_ipt())>> { using generated_config = rocprim::radix_sort_onesweep_config, rocprim::kernel_config, RadixBits, RadixRankAlgorithm>; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_radix_sort_onesweep_benchmark>()); } }; template struct create_ipt())>> { void operator()(std::vector>&) const {} }; template static void create_algo(std::vector>& storage) { create_ipt<1u, RadixRankAlgorithm>()(storage); create_ipt<4u, RadixRankAlgorithm>()(storage); create_ipt<6u, RadixRankAlgorithm>()(storage); create_ipt<8u, RadixRankAlgorithm>()(storage); create_ipt<12u, RadixRankAlgorithm>()(storage); create_ipt<16u, RadixRankAlgorithm>()(storage); create_ipt<18u, RadixRankAlgorithm>()(storage); create_ipt<22u, RadixRankAlgorithm>()(storage); } static void create(std::vector>& storage) { create_algo(storage); create_algo(storage); } }; #else // BENCHMARK_CONFIG_TUNING #define CREATE_RADIX_SORT_BENCHMARK(...) 
\ { \ const device_radix_sort_onesweep_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } inline void add_sort_keys_benchmarks(std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { CREATE_RADIX_SORT_BENCHMARK(int) CREATE_RADIX_SORT_BENCHMARK(float) CREATE_RADIX_SORT_BENCHMARK(long long) CREATE_RADIX_SORT_BENCHMARK(int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half) CREATE_RADIX_SORT_BENCHMARK(short) } inline void add_sort_pairs_benchmarks(std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_RADIX_SORT_BENCHMARK(int, float) CREATE_RADIX_SORT_BENCHMARK(int, double) CREATE_RADIX_SORT_BENCHMARK(int, float2) CREATE_RADIX_SORT_BENCHMARK(int, custom_float2) CREATE_RADIX_SORT_BENCHMARK(int, double2) CREATE_RADIX_SORT_BENCHMARK(int, custom_double2) CREATE_RADIX_SORT_BENCHMARK(long long, float) CREATE_RADIX_SORT_BENCHMARK(long long, double) CREATE_RADIX_SORT_BENCHMARK(long long, float2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_float2) CREATE_RADIX_SORT_BENCHMARK(long long, double2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_double2) CREATE_RADIX_SORT_BENCHMARK(int8_t, int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t, uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half, rocprim::half) } #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_reduce.cpp000066400000000000000000000120221502235215600231710ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_reduce.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif #define CREATE_BENCHMARK(T, REDUCE_OP) \ { \ const device_reduce_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else using custom_float2 = custom_type; using custom_double2 = 
custom_type; CREATE_BENCHMARK(int, rocprim::plus) CREATE_BENCHMARK(long long, rocprim::plus) CREATE_BENCHMARK(float, rocprim::plus) CREATE_BENCHMARK(double, rocprim::plus) CREATE_BENCHMARK(int8_t, rocprim::plus) CREATE_BENCHMARK(uint8_t, rocprim::plus) CREATE_BENCHMARK(rocprim::half, rocprim::plus) CREATE_BENCHMARK(custom_float2, rocprim::plus) CREATE_BENCHMARK(custom_double2, rocprim::plus) #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_reduce.parallel.cpp.in000066400000000000000000000027701502235215600254020ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include "benchmark_utils.hpp" #include "benchmark_device_reduce.parallel.hpp" namespace { auto benchmark = config_autotune_register::create, rocprim::reduce_config<@BlockSize@u, @ItemsPerThread@u, rocprim::block_reduce_algorithm::using_warp_reduce>>>(); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_reduce.parallel.hpp000066400000000000000000000145121502235215600247770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_REDUCE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_REDUCE_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM HIP API #include #include #include #include #include constexpr const char* get_reduce_method_name(rocprim::block_reduce_algorithm alg) { switch(alg) { case rocprim::block_reduce_algorithm::raking_reduce: return "raking_reduce"; case rocprim::block_reduce_algorithm::raking_reduce_commutative_only: return "raking_reduce_commutative_only"; case rocprim::block_reduce_algorithm::using_warp_reduce: return "using_warp_reduce"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } template std::string config_name() { const rocprim::detail::reduce_config_params config = Config(); return "{bs:" + std::to_string(config.reduce_config.block_size) + ",ipt:" + std::to_string(config.reduce_config.items_per_thread) + ",method:" + std::string(get_reduce_method_name(config.block_reduce_method)) + "}"; } template<> inline std::string config_name() { return "default_config"; } template, typename Config = rocprim::default_config> struct device_reduce_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:reduce,key_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(T); BinaryFunction reduce_op{}; const auto random_range = limit_random_range(0, 1000); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); 
HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; void * d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK( rocprim::reduce( d_temp_storage, temp_storage_size_bytes, d_input, d_output, T(), size, reduce_op, stream ) ); HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rocprim::reduce( d_temp_storage, temp_storage_size_bytes, d_input, d_output, T(), size, reduce_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rocprim::reduce( d_temp_storage, temp_storage_size_bytes, d_input, d_output, T(), size, reduce_op, stream ) ); } HIP_CHECK(hipStreamSynchronize(stream)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } }; #endif rocPRIM-rocm-6.4.3/benchmark/benchmark_device_reduce_by_key.cpp000066400000000000000000000137101502235215600245400ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. 
All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_reduce_by_key.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #include #ifndef DEFAULT_BYTES constexpr size_t DEFAULT_BYTES = size_t{2} << 30; // 2 GiB #endif #define CREATE_BENCHMARK(KEY, VALUE, MAX_SEGMENT_LENGTH) \ { \ const device_reduce_by_key_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ } #define CREATE_BENCHMARK_TYPE(KEY, VALUE) \ CREATE_BENCHMARK(KEY, VALUE, 10); \ CREATE_BENCHMARK(KEY, VALUE, 1000) // some of the tuned types #define CREATE_BENCHMARK_TYPES(KEY) \ CREATE_BENCHMARK_TYPE(KEY, int8_t); \ CREATE_BENCHMARK_TYPE(KEY, rocprim::half); \ CREATE_BENCHMARK_TYPE(KEY, int32_t); \ CREATE_BENCHMARK_TYPE(KEY, float); \ CREATE_BENCHMARK_TYPE(KEY, double) // all of the tuned types #define CREATE_BENCHMARK_TYPE_TUNING(KEY) \ CREATE_BENCHMARK_TYPE(KEY, int8_t); \ CREATE_BENCHMARK_TYPE(KEY, int16_t); \ CREATE_BENCHMARK_TYPE(KEY, int32_t); \ CREATE_BENCHMARK_TYPE(KEY, int64_t); \ CREATE_BENCHMARK_TYPE(KEY, rocprim::half); \ CREATE_BENCHMARK_TYPE(KEY, float); \ CREATE_BENCHMARK_TYPE(KEY, double) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int 
trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, seed, stream); #else // tuned types CREATE_BENCHMARK_TYPES(int8_t); CREATE_BENCHMARK_TYPES(int16_t); CREATE_BENCHMARK_TYPE_TUNING(int32_t); CREATE_BENCHMARK_TYPE_TUNING(int64_t); CREATE_BENCHMARK_TYPES(rocprim::half); CREATE_BENCHMARK_TYPES(float); CREATE_BENCHMARK_TYPES(double); // custom types using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_BENCHMARK_TYPE(int, custom_float2); CREATE_BENCHMARK_TYPE(int, custom_double2); CREATE_BENCHMARK_TYPE(long long, custom_float2); CREATE_BENCHMARK_TYPE(long long, custom_double2); #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_reduce_by_key.parallel.cpp.in000066400000000000000000000026511502235215600267420ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_reduce_by_key.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_reduce_by_key_benchmark_generator<@KeyType@, @ValueType@, @BlockSize@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_reduce_by_key.parallel.hpp000066400000000000000000000251721502235215600263450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_REDUCE_BY_KEY_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_REDUCE_BY_KEY_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM HIP API #include #include #include #include #include template std::string config_name() { const rocprim::detail::reduce_by_key_config_params params = Config(); return "{bs:" + std::to_string(params.kernel_config.block_size) + ",ipt:" + std::to_string(params.kernel_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_reduce_by_key_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:reduce_by_key,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",max_segment_length:" + std::to_string(MaxSegmentLength) + ",cfg:" + config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { constexpr int batch_size = 10; constexpr int warmup_size = 5; constexpr std::array tuning_max_segment_lengths = {10, 1000}; constexpr int num_input_arrays = is_tuning ? 
tuning_max_segment_lengths.size() : 1; constexpr size_t item_size = sizeof(KeyType) + sizeof(ValueType); const size_t size = bytes / item_size; std::vector key_inputs[num_input_arrays]; if(is_tuning) { for(size_t i = 0; i < tuning_max_segment_lengths.size(); ++i) { key_inputs[i] = get_random_segments_iota(size, tuning_max_segment_lengths[i], seed.get_0()); } } else { key_inputs[0] = get_random_segments_iota(size, MaxSegmentLength, seed.get_0()); } std::vector value_input(size); std::iota(value_input.begin(), value_input.end(), 0); KeyType* d_key_inputs[num_input_arrays]; for(int i = 0; i < num_input_arrays; ++i) { HIP_CHECK(hipMalloc(&d_key_inputs[i], size * sizeof(*d_key_inputs[i]))); HIP_CHECK(hipMemcpy(d_key_inputs[i], key_inputs[i].data(), size * sizeof(*d_key_inputs[i]), hipMemcpyHostToDevice)); } ValueType* d_value_input; HIP_CHECK(hipMalloc(&d_value_input, size * sizeof(*d_value_input))); HIP_CHECK(hipMemcpy(d_value_input, value_input.data(), size * sizeof(*d_value_input), hipMemcpyHostToDevice)); KeyType* d_unique_output; ValueType* d_aggregates_output; unsigned int* d_unique_count_output; HIP_CHECK(hipMalloc(&d_unique_output, size * sizeof(*d_unique_output))); HIP_CHECK(hipMalloc(&d_aggregates_output, size * sizeof(*d_aggregates_output))); HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(*d_unique_count_output))); rocprim::plus reduce_op; rocprim::equal_to key_compare_op; const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_input = [&](KeyType* d_key_input) { if ROCPRIM_IF_CONSTEXPR(!Deterministic) { HIP_CHECK(rocprim::reduce_by_key(d_temp_storage, temp_storage_size_bytes, d_key_input, d_value_input, size, d_unique_output, d_aggregates_output, d_unique_count_output, reduce_op, key_compare_op, stream)); } else { HIP_CHECK(rocprim::deterministic_reduce_by_key(d_temp_storage, temp_storage_size_bytes, d_key_input, d_value_input, size, d_unique_output, d_aggregates_output, d_unique_count_output, reduce_op, 
key_compare_op, stream)); } }; // One tuning iteration runs multiple inputs with different distributions, // preventing overfitting the config to a specific data distrubution. // Note that this does not weigh the inputs/distributions equally as // generally larger segments perform better. for(int i = 0; i < num_input_arrays; ++i) { dispatch_input(d_key_inputs[i]); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * item_size); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temp_storage)); for(int i = 0; i < num_input_arrays; ++i) { HIP_CHECK(hipFree(d_key_inputs[i])); } HIP_CHECK(hipFree(d_value_input)); HIP_CHECK(hipFree(d_unique_output)); HIP_CHECK(hipFree(d_aggregates_output)); HIP_CHECK(hipFree(d_unique_count_output)); } static constexpr bool is_tuning = !std::is_same::value; }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_reduce_by_key_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::reduce_by_key_config; // max segment length argument is irrelevant, tuning overrides segment 
length storage.emplace_back( std::make_unique< device_reduce_by_key_benchmark>()); } }; static void create(std::vector>& storage) { static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_REDUCE_BY_KEY_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_reduce_by_key_deterministic.cpp000066400000000000000000000116011502235215600274600ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_reduce_by_key.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #include #ifndef DEFAULT_BYTES constexpr size_t DEFAULT_BYTES = size_t{2} << 30; // 2 GiB #endif #define CREATE_BENCHMARK(KEY, VALUE, MAX_SEGMENT_LENGTH) \ { \ const device_reduce_by_key_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ } #define CREATE_BENCHMARK_TYPE(KEY, VALUE) \ CREATE_BENCHMARK(KEY, VALUE, 10); \ CREATE_BENCHMARK(KEY, VALUE, 1000) // some of the tuned types #define CREATE_BENCHMARK_TYPES(KEY) \ CREATE_BENCHMARK_TYPE(KEY, int8_t); \ CREATE_BENCHMARK_TYPE(KEY, rocprim::half); \ CREATE_BENCHMARK_TYPE(KEY, int32_t); \ CREATE_BENCHMARK_TYPE(KEY, float); \ CREATE_BENCHMARK_TYPE(KEY, double) // all of the tuned types #define CREATE_BENCHMARK_TYPE_TUNING(KEY) \ CREATE_BENCHMARK_TYPE(KEY, int8_t); \ CREATE_BENCHMARK_TYPE(KEY, int16_t); \ CREATE_BENCHMARK_TYPE(KEY, int32_t); \ CREATE_BENCHMARK_TYPE(KEY, int64_t); \ CREATE_BENCHMARK_TYPE(KEY, rocprim::half); \ CREATE_BENCHMARK_TYPE(KEY, float); \ CREATE_BENCHMARK_TYPE(KEY, double) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); 
benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; // tuned types CREATE_BENCHMARK_TYPES(int8_t); CREATE_BENCHMARK_TYPES(int16_t); CREATE_BENCHMARK_TYPE_TUNING(int32_t); CREATE_BENCHMARK_TYPE_TUNING(int64_t); CREATE_BENCHMARK_TYPES(rocprim::half); CREATE_BENCHMARK_TYPES(float); CREATE_BENCHMARK_TYPES(double); // custom types using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_BENCHMARK_TYPE(int, custom_float2); CREATE_BENCHMARK_TYPE(int, custom_double2); CREATE_BENCHMARK_TYPE(long long, custom_float2); CREATE_BENCHMARK_TYPE(long long, custom_double2); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_run_length_encode.cpp000066400000000000000000000131151502235215600254100ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_run_length_encode.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #ifndef DEFAULT_BYTES constexpr size_t DEFAULT_BYTES = size_t{2} << 30; // 2 GiB #endif #define CREATE_ENCODE_BENCHMARK(T, ML) \ { \ const device_run_length_encode_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ } template void add_encode_benchmarks(std::vector& benchmarks, size_t size, const managed_seed& seed, hipStream_t stream) { using custom_float2 = custom_type; using custom_double2 = custom_type; // all tuned types CREATE_ENCODE_BENCHMARK(int8_t, MaxLength); CREATE_ENCODE_BENCHMARK(int16_t, MaxLength); CREATE_ENCODE_BENCHMARK(int32_t, MaxLength); CREATE_ENCODE_BENCHMARK(int64_t, MaxLength); CREATE_ENCODE_BENCHMARK(rocprim::half, MaxLength); CREATE_ENCODE_BENCHMARK(float, MaxLength); CREATE_ENCODE_BENCHMARK(double, MaxLength); // custom types CREATE_ENCODE_BENCHMARK(custom_float2, MaxLength); CREATE_ENCODE_BENCHMARK(custom_double2, MaxLength); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); 
parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); benchmark::AddCustomContext("seed", seed_type); std::vector benchmarks; // Add benchmarks #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, seed, stream); #else add_encode_benchmarks<1000>(benchmarks, size, seed, stream); add_encode_benchmarks<10>(benchmarks, size, seed, stream); #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_run_length_encode.parallel.cpp.in000066400000000000000000000027021502235215600276100ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_run_length_encode.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_run_length_encode_benchmark_generator< @KeyType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_run_length_encode.parallel.hpp000066400000000000000000000223321502235215600272110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #endif template std::string run_length_encode_config_name() { const rocprim::detail::reduce_by_key_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; } template<> inline std::string run_length_encode_config_name() { return "default_config"; } template struct device_run_length_encode_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:run_length_encode,key_type:" + std::string(Traits::name()) + ",keys_max_length:" + std::to_string(MaxLength) + ",cfg:" + run_length_encode_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using key_type = T; using count_type = unsigned int; const size_t size = bytes / sizeof(T); // Generate data std::vector input(size); unsigned int runs_count = 0; const auto random_range = limit_random_range(1, MaxLength); std::vector key_counts = get_random_data(100000, random_range.first, random_range.second, seed.get_0()); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; ++i) { input[i] = runs_count; } ++runs_count; offset += key_count; } key_type* d_input; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); key_type* d_unique_output; count_type* d_counts_output; count_type* d_runs_count_output; 
HIP_CHECK( hipMalloc(reinterpret_cast(&d_unique_output), runs_count * sizeof(key_type))); HIP_CHECK( hipMalloc(reinterpret_cast(&d_counts_output), runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_runs_count_output), sizeof(count_type))); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::run_length_encode(nullptr, temporary_storage_bytes, d_input, size, d_unique_output, d_counts_output, d_runs_count_output, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; ++i) { HIP_CHECK(rocprim::run_length_encode(d_temporary_storage, temporary_storage_bytes, d_input, size, d_unique_output, d_counts_output, d_runs_count_output, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; ++i) { HIP_CHECK(rocprim::run_length_encode(d_temporary_storage, temporary_storage_bytes, d_input, size, d_unique_output, d_counts_output, d_runs_count_output, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_unique_output)); HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } }; 
#ifdef BENCHMARK_CONFIG_TUNING template struct device_run_length_encode_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::reduce_by_key_config; storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); } }; static void create(std::vector>& storage) { static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_run_length_encode_non_trivial_runs.cpp000066400000000000000000000134361502235215600310710ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #ifndef DEFAULT_BYTES constexpr size_t DEFAULT_BYTES = size_t{2} << 30; // 2 GiB #endif // CHANGE #define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T, ML) \ { \ const device_non_trivial_runs_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ } template void add_non_trivial_runs_benchmarks(std::vector& benchmarks, size_t size, const managed_seed& seed, hipStream_t stream) { using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int16_t, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int32_t, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int64_t, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(rocprim::half, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(float, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(double, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2, MaxLength); CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2, MaxLength); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const 
size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); benchmark::AddCustomContext("seed", seed_type); std::vector benchmarks; // Add benchmarks #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, seed, stream); #else add_non_trivial_runs_benchmarks<16>(benchmarks, size, seed, stream); add_non_trivial_runs_benchmarks<256>(benchmarks, size, seed, stream); add_non_trivial_runs_benchmarks<4096>(benchmarks, size, seed, stream); #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.cpp.in000066400000000000000000000030041502235215600332570ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_non_trivial_runs_benchmark_generator< @KeyType@, @BlockSize@, rocprim::block_load_method::@BlockLoadMethod@>::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp000066400000000000000000000236151502235215600326710ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_NON_TRIVIAL_RUNS_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_NON_TRIVIAL_RUNS_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #endif template std::string non_trivial_runs_config_name() { const rocprim::detail::non_trivial_runs_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",load_method:" + get_block_load_method_name(config.load_input_method) + "}"; } template<> inline std::string non_trivial_runs_config_name() { return "default_config"; } template struct device_non_trivial_runs_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:run_length_encode,subalgo:non_trivial,key_type:" + std::string(Traits::name()) + ",keys_max_length:" + std::to_string(MaxLength) + ",cfg:" + non_trivial_runs_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using offset_type = unsigned int; using count_type = unsigned int; constexpr int batch_size = 10; constexpr int warmup_size = 5; constexpr std::array tuning_max_segment_lengths = {10, 1000}; constexpr int num_input_arrays = is_tuning ? 
tuning_max_segment_lengths.size() : 1; constexpr size_t item_size = sizeof(T) + sizeof(offset_type) + sizeof(count_type); const size_t size = bytes / item_size; // Generate data std::vector input[num_input_arrays]; if(is_tuning) { for(size_t i = 0; i < tuning_max_segment_lengths.size(); ++i) { input[i] = get_random_segments_iota(size, tuning_max_segment_lengths[i], seed.get_0()); } } else { input[0] = get_random_segments_iota(size, MaxLength, seed.get_0()); } T* d_input[num_input_arrays]; for(int i = 0; i < num_input_arrays; ++i) { HIP_CHECK(hipMalloc(&d_input[i], size * sizeof(*d_input[i]))); HIP_CHECK(hipMemcpy(d_input[i], input[i].data(), size * sizeof(*d_input[i]), hipMemcpyHostToDevice)); } offset_type* d_offsets_output; HIP_CHECK(hipMalloc(&d_offsets_output, size * sizeof(*d_offsets_output))); count_type* d_counts_output; HIP_CHECK(hipMalloc(&d_counts_output, size * sizeof(*d_counts_output))); count_type* d_runs_count_output; HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(*d_runs_count_output))); const auto dispatch = [&](void* d_temporary_storage, size_t& temporary_storage_bytes) { const auto dispatch_input = [&](T* d_input) { HIP_CHECK( rocprim::run_length_encode_non_trivial_runs(d_temporary_storage, temporary_storage_bytes, d_input, size, d_offsets_output, d_counts_output, d_runs_count_output, stream, false)); }; for(int i = 0; i < num_input_arrays; ++i) { dispatch_input(d_input[i]); } }; // Allocate temporary storage memory size_t temporary_storage_bytes = 0; dispatch(nullptr, temporary_storage_bytes); void* d_temporary_storage; HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(int i = 0; i < warmup_size; ++i) { dispatch(d_temporary_storage, temporary_storage_bytes); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event 
HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temporary_storage, temporary_storage_bytes); } HIP_CHECK(hipStreamSynchronize(stream)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * item_size); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); for(int i = 0; i < num_input_arrays; ++i) { HIP_CHECK(hipFree(d_input[i])); } HIP_CHECK(hipFree(d_offsets_output)); HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } static constexpr bool is_tuning = !std::is_same::value; }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_non_trivial_runs_benchmark_generator { using OffsetCountPairT = ::rocprim::tuple; static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = std::max(sizeof(T), sizeof(OffsetCountPairT)); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static constexpr unsigned int min_items_per_thread_exponent = 3u; static constexpr bool is_load_warp_transpose = BlockLoadMethod == ::rocprim::block_load_method::block_load_warp_transpose; static constexpr bool is_warp_load_supp = is_load_warp_transpose && BlockSize == ROCPRIM_WARP_SIZE_64; template struct create_ipt { void operator()(std::vector>& storage) { if(!is_load_warp_transpose || is_warp_load_supp) { using config = 
rocprim::non_trivial_runs_config< BlockSize, items_per_thread, BlockLoadMethod, rocprim::block_scan_algorithm::using_warp_scan>; storage.emplace_back( std::make_unique>()); } } private: static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExp; }; static void create(std::vector>& storage) { static_for_each< make_index_range, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_NON_TRIVIAL_RUNS_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan.cpp000066400000000000000000000124271502235215600226570ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_scan.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP) \ { \ const device_scan_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_BENCHMARK(T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); 
config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_BENCHMARK(int, rocprim::plus) CREATE_BENCHMARK(float, rocprim::plus) CREATE_BENCHMARK(double, rocprim::plus) CREATE_BENCHMARK(long long, rocprim::plus) CREATE_BENCHMARK(float2, rocprim::plus) CREATE_BENCHMARK(custom_float2, rocprim::plus) CREATE_BENCHMARK(double2, rocprim::plus) CREATE_BENCHMARK(custom_double2, rocprim::plus) CREATE_BENCHMARK(int8_t, rocprim::plus) CREATE_BENCHMARK(uint8_t, rocprim::plus) CREATE_BENCHMARK(rocprim::half, rocprim::plus) #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan.parallel.cpp.in000066400000000000000000000026521502235215600250560ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_scan.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_scan_benchmark_generator<@DataType@, rocprim::block_scan_algorithm::@Algo@>::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan.parallel.hpp000066400000000000000000000301611502235215600244520ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include template std::string config_name() { const rocprim::detail::scan_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",method:" + std::string(get_block_scan_algorithm_name(config.block_scan_method)) + "}"; } template<> inline std::string config_name() { return "default_config"; } template, bool Deterministic = false, class Config = rocprim::default_config> struct device_scan_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:scan,exclusive:" + (Exclusive ? "true"s : "false"s) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } template auto run_device_scan(void* temporary_storage, size_t& storage_size, T* input, T* output, const T initial_value, const size_t input_size, ScanOp scan_op, const hipStream_t stream, const bool debug = false) const -> typename std::enable_if::type { if ROCPRIM_IF_CONSTEXPR(!Deterministic) { return rocprim::exclusive_scan(temporary_storage, storage_size, input, output, initial_value, input_size, scan_op, stream, debug); } else { return rocprim::deterministic_exclusive_scan(temporary_storage, storage_size, input, output, initial_value, input_size, scan_op, stream, debug); } } template auto run_device_scan(void* temporary_storage, size_t& storage_size, T* input, T* output, const T initial_value, const size_t input_size, ScanOp scan_op, const hipStream_t stream, const bool debug = false) const -> typename std::enable_if::type { (void)initial_value; if ROCPRIM_IF_CONSTEXPR(!Deterministic) { return rocprim::inclusive_scan(temporary_storage, 
storage_size, input, output, input_size, scan_op, stream, debug); } else { return rocprim::deterministic_inclusive_scan(temporary_storage, storage_size, input, output, input_size, scan_op, stream, debug); } } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(T); ScanOp scan_op{}; const auto random_range = limit_random_range(0, 1000); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T initial_value = T(123); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; void* d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream))); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 5; i++) { HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((run_device_scan(d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; 
HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_scan_benchmark_generator { template struct create_block_scan_algorithm { template struct create_block_size { template struct create_ipt { void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique, false, rocprim::scan_config>>()); } }; void operator()(std::vector>& storage) { // Limit items per thread to not over-use shared memory static constexpr unsigned int max_items_per_thread = ::rocprim::min(65536 / (block_size * sizeof(T)), 24); static_for_each, create_ipt>(storage); } static constexpr unsigned int block_size = 1u << BlockSizeExponent; }; static void create(std::vector>& storage) { static_for_each(storage); } }; static void create(std::vector>& storage) { // Block sizes 64, 128, 256 create_block_scan_algorithm>::create(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan_by_key.cpp000066400000000000000000000140271502235215600242170ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_scan_by_key.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, MAX_SEGMENT_LENGTH) \ { \ const device_scan_by_key_benchmark, \ MAX_SEGMENT_LENGTH> \ instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 1) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 16) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 256) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 4096) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 65536) #define CREATE_BENCHMARK(T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); 
benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_BENCHMARK(int, rocprim::plus) CREATE_BENCHMARK(float, rocprim::plus) CREATE_BENCHMARK(double, rocprim::plus) CREATE_BENCHMARK(long long, rocprim::plus) CREATE_BENCHMARK(float2, rocprim::plus) CREATE_BENCHMARK(custom_float2, rocprim::plus) CREATE_BENCHMARK(double2, rocprim::plus) CREATE_BENCHMARK(custom_double2, rocprim::plus) CREATE_BENCHMARK(int8_t, rocprim::plus) CREATE_BENCHMARK(uint8_t, rocprim::plus) CREATE_BENCHMARK(rocprim::half, rocprim::plus) #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan_by_key.parallel.cpp.in000066400000000000000000000027041502235215600264160ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_scan_by_key.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_scan_by_key_benchmark_generator<@KeyType@, @ValueType@, rocprim::block_scan_algorithm::@Algo@>::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan_by_key.parallel.hpp000066400000000000000000000362501502235215600260210ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_
#define ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_

#include "benchmark_utils.hpp"

// Google Benchmark
#include

// HIP API
#include

// rocPRIM
#include

#include
#include
#include
#include

/// Builds the "cfg" fragment of a benchmark name from a tuning configuration.
/// The fragment lists the block size, the items per thread and the block scan
/// method read from rocprim::detail::scan_by_key_config_params.
template
std::string config_name()
{
    const rocprim::detail::scan_by_key_config_params config = Config();
    return "{bs:" + std::to_string(config.kernel_config.block_size)
           + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",method:"
           + std::string(get_block_scan_algorithm_name(config.block_scan_method)) + "}";
}

/// Explicit specialization that reports the fixed name "default_config".
template<>
inline std::string config_name()
{
    return "default_config";
}

/// Benchmark of rocPRIM's device-level scan-by-key.
///
/// - Exclusive selects between the exclusive and inclusive entry points.
/// - Deterministic selects the deterministic_* variants of those entry points.
/// - MaxSegmentLength is forwarded to get_random_segments when the keys are generated.
/// - Config is only reported through config_name() in this struct's own code.
template, class CompareOp = rocprim::equal_to, unsigned int MaxSegmentLength = 1024, bool Deterministic = false, class Config = rocprim::default_config>
struct device_scan_by_key_benchmark : public config_autotune_interface
{
    /// Returns the benchmark name, formatted by bench_naming::format_name.
    std::string name() const override
    {
        using namespace std::string_literals;
        return bench_naming::format_name(
            "{lvl:device,algo:scan_by_key,exclusive:" + (Exclusive ? "true"s : "false"s)
            + ",key_type:" + std::string(Traits::name())
            + ",value_type:" + std::string(Traits::name()) + ",max_segment_length:"
            + std::to_string(MaxSegmentLength) + ",cfg:" + config_name() + "}");
    }

    /// Overload that forwards to the exclusive scan-by-key entry points
    /// (regular or deterministic, depending on Deterministic).
    /// NOTE(review): overload selection relies on std::enable_if in the trailing return
    /// type; its condition is not visible here -- presumably it tests Exclusive.
    template
    auto run_device_scan_by_key(void*             temporary_storage,
                                size_t&           storage_size,
                                const Key*        keys,
                                const Value*      input,
                                Value*            output,
                                const Value       initial_value,
                                const size_t      input_size,
                                const ScanOp      scan_op,
                                const CompareOp   compare_op,
                                const hipStream_t stream,
                                const bool        debug = false) const ->
        typename std::enable_if::type
    {
        if ROCPRIM_IF_CONSTEXPR(!Deterministic)
        {
            return rocprim::exclusive_scan_by_key(temporary_storage,
                                                  storage_size,
                                                  keys,
                                                  input,
                                                  output,
                                                  initial_value,
                                                  input_size,
                                                  scan_op,
                                                  compare_op,
                                                  stream,
                                                  debug);
        }
        else
        {
            return rocprim::deterministic_exclusive_scan_by_key(temporary_storage,
                                                                storage_size,
                                                                keys,
                                                                input,
                                                                output,
                                                                initial_value,
                                                                input_size,
                                                                scan_op,
                                                                compare_op,
                                                                stream,
                                                                debug);
        }
    }

    /// Overload that forwards to the inclusive scan-by-key entry points
    /// (regular or deterministic, depending on Deterministic).
    /// The initial value is accepted for signature parity but never used.
    template
    auto run_device_scan_by_key(void*        temporary_storage,
                                size_t&      storage_size,
                                const Key*   keys,
                                const Value* input,
                                Value*       output,
                                const Value /*initial_value*/,
                                const size_t      input_size,
                                const ScanOp      scan_op,
                                const CompareOp   compare_op,
                                const hipStream_t stream,
                                const bool        debug = false) const ->
        typename std::enable_if::type
    {
        if ROCPRIM_IF_CONSTEXPR(!Deterministic)
        {
            return rocprim::inclusive_scan_by_key(temporary_storage,
                                                  storage_size,
                                                  keys,
                                                  input,
                                                  output,
                                                  input_size,
                                                  scan_op,
                                                  compare_op,
                                                  stream,
                                                  debug);
        }
        else
        {
            return rocprim::deterministic_inclusive_scan_by_key(temporary_storage,
                                                                storage_size,
                                                                keys,
                                                                input,
                                                                output,
                                                                input_size,
                                                                scan_op,
                                                                compare_op,
                                                                stream,
                                                                debug);
        }
    }

    /// Runs the benchmark.
    /// It generates keys and values on the host, copies them to the device, queries
    /// and allocates temporary storage, performs 5 warm-up calls, then times batches
    /// of 10 calls with HIP events and reports the batch time to Google Benchmark.
    ///
    /// \param state  Google Benchmark state driving the measurement loop.
    /// \param bytes  input size in bytes; the element count is bytes / sizeof(Value).
    /// \param seed   seed source; get_0() seeds the keys, get_1() seeds the values.
    /// \param stream HIP stream on which the scans and timing events are issued.
    void run(benchmark::State&   state,
             size_t              bytes,
             const managed_seed& seed,
             hipStream_t         stream) const override
    {
        // Calculate the number of elements
        size_t size = bytes / sizeof(Value);

        constexpr bool debug = false;

        const std::vector keys = get_random_segments(size, MaxSegmentLength, seed.get_0());

        const auto        random_range = limit_random_range(0, 1000);
        const std::vector input
            = get_random_data(size, random_range.first, random_range.second, seed.get_1());

        ScanOp    scan_op{};
        CompareOp compare_op{};

        // Only consumed by the exclusive overload of run_device_scan_by_key.
        Value  initial_value = Value(123);
        Value* d_input;
        Key*   d_keys;
        Value* d_output;
        HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0])));
        HIP_CHECK(hipMalloc(&d_keys, keys.size() * sizeof(keys[0])));
        HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(input[0])));
        HIP_CHECK(hipMemcpy(d_input,
                            input.data(),
                            input.size() * sizeof(input[0]),
                            hipMemcpyHostToDevice));
        HIP_CHECK(
            hipMemcpy(d_keys, keys.data(), keys.size() * sizeof(keys[0]), hipMemcpyHostToDevice));

        // Allocate temporary storage memory
        size_t temp_storage_size_bytes;
        void*  d_temp_storage = nullptr;
        // Get size of d_temp_storage: the first call is made with a null storage pointer
        // and its resulting size is what gets allocated below.
        HIP_CHECK((run_device_scan_by_key(d_temp_storage,
                                          temp_storage_size_bytes,
                                          d_keys,
                                          d_input,
                                          d_output,
                                          initial_value,
                                          size,
                                          scan_op,
                                          compare_op,
                                          stream,
                                          debug)));
        HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));

        // Warm-up
        for(size_t i = 0; i < 5; i++)
        {
            HIP_CHECK((run_device_scan_by_key(d_temp_storage,
                                              temp_storage_size_bytes,
                                              d_keys,
                                              d_input,
                                              d_output,
                                              initial_value,
                                              size,
                                              scan_op,
                                              compare_op,
                                              stream,
                                              debug)));
        }
        HIP_CHECK(hipDeviceSynchronize());

        // HIP events creation
        hipEvent_t start, stop;
        HIP_CHECK(hipEventCreate(&start));
        HIP_CHECK(hipEventCreate(&stop));

        // Number of scans issued per timed iteration.
        const unsigned int batch_size = 10;
        for(auto _ : state)
        {
            // Record start event
            HIP_CHECK(hipEventRecord(start, stream));

            for(size_t i = 0; i < batch_size; i++)
            {
                HIP_CHECK((run_device_scan_by_key(d_temp_storage,
                                                  temp_storage_size_bytes,
                                                  d_keys,
                                                  d_input,
                                                  d_output,
                                                  initial_value,
                                                  size,
                                                  scan_op,
                                                  compare_op,
                                                  stream,
                                                  debug)));
            }

            // Record stop event and wait until it completes
            HIP_CHECK(hipEventRecord(stop, stream));
            HIP_CHECK(hipEventSynchronize(stop));

            float elapsed_mseconds;
            HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
            // The event time is divided by 1000 before being reported for the whole batch.
            state.SetIterationTime(elapsed_mseconds / 1000);
        }

        // Destroy HIP events
        HIP_CHECK(hipEventDestroy(start));
        HIP_CHECK(hipEventDestroy(stop));

        // Throughput counters account for every call in every batch.
        state.SetBytesProcessed(state.iterations() * batch_size * size
                                * (sizeof(Key) + sizeof(Value)));
        state.SetItemsProcessed(state.iterations() * batch_size * size);

        HIP_CHECK(hipFree(d_input));
        HIP_CHECK(hipFree(d_keys));
        HIP_CHECK(hipFree(d_output));
        HIP_CHECK(hipFree(d_temp_storage));
    }
};

#ifdef BENCHMARK_CONFIG_TUNING

/// Generator of tuning benchmarks, only compiled when BENCHMARK_CONFIG_TUNING is
/// defined. The nested functors append device_scan_by_key_benchmark instances with
/// explicit rocprim::scan_by_key_config parameters to `storage`.
template
struct device_scan_by_key_benchmark_generator
{
    template
    struct create_block_scan_algorithm
    {
        template
        struct create_block_size
        {
            /// Appends one benchmark for the enclosing block size and ItemsPerThread,
            /// using transposing block load/store methods.
            template
            struct create_ipt
            {
                void operator()(std::vector>& storage)
                {
                    storage.emplace_back(std::make_unique, rocprim::equal_to, 1024, false, rocprim::scan_by_key_config<
                                                         block_size,
                                                         ItemsPerThread,
                                                         rocprim::block_load_method::block_load_transpose,
                                                         rocprim::block_store_method::block_store_transpose,
                                                         BlockScanAlgorithm>>>());
                }
            };

            /// Runs create_ipt through static_for_each for this block size.
            void operator()(std::vector>& storage)
            {
                // Limit items per thread to not over-use shared memory
                static constexpr unsigned int max_items_per_thread = ::rocprim::min(
                    65536 / (block_size * (sizeof(KeyType) + sizeof(ValueType))),
                    24);
                static_for_each, create_ipt>(storage);
            }

            // Block size is a power of two derived from the exponent parameter.
            static constexpr unsigned int block_size = 1u << BlockSizeExponent;
        };

        static void create(std::vector>& storage)
        {
            static_for_each(storage);
        }
    };

    /// Entry point used by the tuning translation units.
    static void create(std::vector>& storage)
    {
        // Block sizes 64, 128, 256
        create_block_scan_algorithm>::create(storage);
    }
};

#endif // BENCHMARK_CONFIG_TUNING

#endif // ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan_by_key_deterministic.cpp000066400000000000000000000121051502235215600271350ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_scan_by_key.parallel.hpp"
#include "benchmark_utils.hpp"

// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include

// HIP API
#include

#include
#include

// Default input size in bytes (1024 * 1024 * 32 * 4 = 128 MiB) unless overridden
// at compile time.
#ifndef DEFAULT_BYTES
const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
#endif

// Registers one deterministic scan-by-key benchmark (the trailing `true` argument)
// for the given exclusive flag, value type, scan operator and maximum segment length.
// Expects `benchmarks`, `bytes`, `seed` and `stream` to be in scope at the use site.
// NOTE(review): the template argument list of device_scan_by_key_benchmark looks
// truncated here (unbalanced '>') -- confirm against the upstream file.
#define CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, MAX_SEGMENT_LENGTH)  \
    {                                                                  \
        const device_scan_by_key_benchmark,                            \
                                           MAX_SEGMENT_LENGTH,         \
                                           true>                       \
            instance;                                                  \
        REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \
    }

// Registers the benchmark for maximum segment lengths 1, 16, 256, 4096 and 65536.
#define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP) \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 1)     \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 16)    \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 256)   \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 4096)  \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 65536)

// Registers both the inclusive (false) and the exclusive (true) variants.
#define CREATE_BENCHMARK(T, SCAN_OP)              \
    CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \
    CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP)

/// Entry point of the deterministic scan-by-key benchmark executable.
/// It parses the command line, registers one benchmark per value type, configures
/// manual timing and hands control to Google Benchmark.
int main(int argc, char* argv[])
{
    // Declare the accepted options and stop if the command line is invalid.
    cli::Parser parser(argc, argv);
    parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format", "name_format", "human", "either: json,human,txt");
    parser.set_optional("seed", "seed", "random", get_seed_message());
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t bytes  = parser.get("size");
    const int    trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));
    const std::string  seed_type = parser.get("seed");
    const managed_seed seed(seed_type);

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("bytes", std::to_string(bytes));
    benchmark::AddCustomContext("seed", seed_type);

    // Add benchmarks
    std::vector benchmarks = {};

    using custom_float2  = custom_type;
    using custom_double2 = custom_type;

    CREATE_BENCHMARK(int, rocprim::plus)
    CREATE_BENCHMARK(float, rocprim::plus)
    CREATE_BENCHMARK(double, rocprim::plus)
    CREATE_BENCHMARK(long long, rocprim::plus)
    CREATE_BENCHMARK(float2, rocprim::plus)
    CREATE_BENCHMARK(custom_float2, rocprim::plus)
    CREATE_BENCHMARK(double2, rocprim::plus)
    CREATE_BENCHMARK(custom_double2, rocprim::plus)
    CREATE_BENCHMARK(int8_t, rocprim::plus)
    CREATE_BENCHMARK(uint8_t, rocprim::plus)
    CREATE_BENCHMARK(rocprim::half, rocprim::plus)

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations (the default of -1 leaves the count to Google Benchmark)
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_scan_deterministic.cpp000066400000000000000000000103701502235215600255750ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "benchmark_device_scan.parallel.hpp"
#include "benchmark_utils.hpp"

// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include

// HIP API
#include

#include
#include

// Default input size in bytes (1024 * 1024 * 32 * 4 = 128 MiB) unless overridden
// at compile time.
#ifndef DEFAULT_BYTES
const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
#endif

// Registers one device_scan_benchmark instance.
// Expects `benchmarks`, `bytes`, `seed` and `stream` to be in scope at the use site.
#define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP)                   \
    {                                                                  \
        const device_scan_benchmark instance;                          \
        REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \
    }

// Registers both the inclusive (false) and the exclusive (true) variants.
#define CREATE_BENCHMARK(T, SCAN_OP)              \
    CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \
    CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP)

/// Entry point of the deterministic device scan benchmark executable.
/// It parses the command line, registers one benchmark per value type, configures
/// manual timing and hands control to Google Benchmark.
int main(int argc, char* argv[])
{
    // Declare the accepted options and stop if the command line is invalid.
    cli::Parser parser(argc, argv);
    parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format", "name_format", "human", "either: json,human,txt");
    parser.set_optional("seed", "seed", "random", get_seed_message());
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t bytes  = parser.get("size");
    const int    trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));
    const std::string  seed_type = parser.get("seed");
    const managed_seed seed(seed_type);

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("bytes", std::to_string(bytes));
    benchmark::AddCustomContext("seed", seed_type);

    // Add benchmarks
    std::vector benchmarks = {};

    using custom_float2  = custom_type;
    using custom_double2 = custom_type;

    CREATE_BENCHMARK(int, rocprim::plus)
    CREATE_BENCHMARK(float, rocprim::plus)
    CREATE_BENCHMARK(double, rocprim::plus)
    CREATE_BENCHMARK(long long, rocprim::plus)
    CREATE_BENCHMARK(float2, rocprim::plus)
    CREATE_BENCHMARK(custom_float2, rocprim::plus)
    CREATE_BENCHMARK(double2, rocprim::plus)
    CREATE_BENCHMARK(custom_double2, rocprim::plus)
    CREATE_BENCHMARK(int8_t, rocprim::plus)
    CREATE_BENCHMARK(uint8_t, rocprim::plus)
    CREATE_BENCHMARK(rocprim::half, rocprim::plus)

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations (the default of -1 leaves the count to Google Benchmark)
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_search.cpp000066400000000000000000000111011502235215600231640ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "benchmark_device_search.hpp"
#include "benchmark_utils.hpp"

// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include

// HIP API
#include

#include
#include

// Default input size in bytes (1024 * 1024 * 32 * 4 = 128 MiB) unless overridden
// at compile time.
#ifndef DEFAULT_BYTES
const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4;
#endif

// Registers one search benchmark for the given element type, key length and value
// pattern. Expects `benchmarks`, `bytes`, `seed` and `stream` to be in scope.
#define CREATE_BENCHMARK_SEARCH(TYPE, KEY_SIZE, REPEATING)             \
    {                                                                  \
        const device_search_benchmark instance(KEY_SIZE, REPEATING);   \
        REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \
    }

// Registers the benchmark for key lengths 10, 100, 1000 and 10000.
#define CREATE_BENCHMARK_PATTERN(TYPE, REPEATING)       \
    {                                                   \
        CREATE_BENCHMARK_SEARCH(TYPE, 10, REPEATING)    \
        CREATE_BENCHMARK_SEARCH(TYPE, 100, REPEATING)   \
        CREATE_BENCHMARK_SEARCH(TYPE, 1000, REPEATING)  \
        CREATE_BENCHMARK_SEARCH(TYPE, 10000, REPEATING) \
    }

// Registers both the repeating (true) and the random (false) value patterns.
#define CREATE_BENCHMARK(TYPE)                \
    {                                         \
        CREATE_BENCHMARK_PATTERN(TYPE, true)  \
        CREATE_BENCHMARK_PATTERN(TYPE, false) \
    }

/// Entry point of the device search benchmark executable.
/// It parses the command line, registers one set of benchmarks per element type,
/// configures manual timing and hands control to Google Benchmark.
int main(int argc, char* argv[])
{
    // Declare the accepted options and stop if the command line is invalid.
    cli::Parser parser(argc, argv);
    // The value is a byte count (it is reported as "bytes" below and passed on as the
    // benchmark's byte size), so the help text must say bytes, not values.
    parser.set_optional("bytes", "bytes", DEFAULT_BYTES, "number of bytes");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format", "name_format", "human", "either: json,human,txt");
    parser.set_optional("seed", "seed", "random", get_seed_message());
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t bytes  = parser.get("bytes");
    const int    trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));
    const std::string  seed_type = parser.get("seed");
    const managed_seed seed(seed_type);

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("bytes", std::to_string(bytes));
    benchmark::AddCustomContext("seed", seed_type);

    // Add benchmarks
    std::vector benchmarks{};
    CREATE_BENCHMARK(int)
    CREATE_BENCHMARK(long long)
    CREATE_BENCHMARK(int8_t)
    CREATE_BENCHMARK(uint8_t)
    CREATE_BENCHMARK(rocprim::half)
    CREATE_BENCHMARK(short)
    CREATE_BENCHMARK(float)

    using custom_float2          = custom_type;
    using custom_double2         = custom_type;
    using custom_int2            = custom_type;
    using custom_char_double     = custom_type;
    using custom_longlong_double = custom_type;

    CREATE_BENCHMARK(custom_float2)
    CREATE_BENCHMARK(custom_double2)
    CREATE_BENCHMARK(custom_int2)
    CREATE_BENCHMARK(custom_char_double)
    CREATE_BENCHMARK(custom_longlong_double)

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations (the default of -1 leaves the count to Google Benchmark)
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_search.hpp000066400000000000000000000163711502235215600232070ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#ifndef ROCPRIM_BENCHMARK_DEVICE_SEARCH_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEARCH_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include template struct device_search_benchmark : public config_autotune_interface { size_t key_size_ = 10; bool repeating_ = false; device_search_benchmark(size_t KeySize, bool repeating) { key_size_ = KeySize; repeating_ = repeating; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:search,value_pattern:" + (repeating_ ? "repeating"s : "random"s) + ",key_size:" + std::to_string(key_size_) + ",value_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using key_type = Key; using output_type = size_t; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t key_size = std::min(size, key_size_); // Generate data std::vector keys_input = get_random_data(key_size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector input(size); if(repeating_) { // Repeating similar pattern without early exits. 
keys_input[key_size - 1] = 0; for(size_t i = 0; i < size; i++) { input[i] = keys_input[i % key_size]; } keys_input[key_size - 1] = 1; } else { input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0() + 1); } key_type* d_keys_input; key_type* d_input; output_type* d_output; HIP_CHECK(hipMalloc(&d_keys_input, key_size * sizeof(*d_keys_input))); HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), key_size * sizeof(*d_keys_input), hipMemcpyHostToDevice)); rocprim::equal_to compare_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::search(d_temporary_storage, temporary_storage_bytes, d_input, d_keys_input, d_output, size, key_size, compare_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rocprim::search(d_temporary_storage, temporary_storage_bytes, d_input, d_keys_input, d_output, size, key_size, compare_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::search(d_temporary_storage, temporary_storage_bytes, d_input, d_keys_input, d_output, size, key_size, compare_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); 
HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(*d_input)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_SEARCH_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_search_n.cpp000066400000000000000000000055721502235215600235200ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_search_n.parallel.hpp"

/// Entry point of the device search_n benchmark executable.
/// It reads the command-line options, registers the search_n benchmarks, switches
/// them to manual timing, runs them and finally releases the benchmark resources.
int main(int argc, char* argv[])
{
    // Command-line options understood by this executable.
    cli::Parser options(argc, argv);
    options.set_optional("size", "size", size_t{2} << 30, "number of input bytes");
    options.set_optional("trials", "trials", -1, "number of iterations");
    options.set_optional("name_format", "name_format", "human", "either: json,human,txt");
    options.set_optional("seed", "seed", "random", get_seed_message());
    options.run_and_exit_if_error();

    // Let Google Benchmark consume its own flags before reading ours.
    benchmark::Initialize(&argc, argv);
    const size_t input_bytes       = options.get("size");
    const int    forced_iterations = options.get("trials");
    bench_naming::set_format(options.get("name_format"));
    const std::string  seed_option = options.get("seed");
    const managed_seed seed(seed_option);

    // All work is issued on the default HIP stream.
    hipStream_t stream = 0; // default

    // Context that is written into the benchmark report.
    add_common_benchmark_info();
    benchmark::AddCustomContext("size", std::to_string(input_bytes));
    benchmark::AddCustomContext("seed", seed_option);

    // Register the benchmarks.
    std::vector benchmarks{};
    add_benchmark_search_n(benchmarks, seed, stream, input_bytes);

    // Configure every benchmark: manual timing in milliseconds, and a fixed
    // iteration count only when a positive number of trials was requested.
    const bool force_iterations = forced_iterations > 0;
    for(auto& bench : benchmarks)
    {
        bench->UseManualTime();
        bench->Unit(benchmark::kMillisecond);
        if(force_iterations)
        {
            bench->Iterations(forced_iterations);
        }
    }

    // Run, then release what the search_n benchmarks registered.
    benchmark::RunSpecifiedBenchmarks();
    clean_up_benchmarks_search_n();

    return 0;
}
rocPRIM-rocm-6.4.3/benchmark/benchmark_device_search_n.parallel.cpp.in000066400000000000000000000026411502235215600257120ustar00rootroot00000000000000
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_search_n.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_search_n_benchmark_generator< @InputType@, @BlockSize@>::create); }rocPRIM-rocm-6.4.3/benchmark/benchmark_device_search_n.parallel.hpp000066400000000000000000000377401502235215600253220ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SEARCH_N_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEARCH_N_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "cmdparser.hpp" // gbench #include // HIP #include // rocPRIM #include #include // C++ Standard Library #include #include #include #include #include #include #include #include using custom_int2 = custom_type; using custom_double2 = custom_type; using custom_longlong_double = custom_type; namespace { template struct type_arr { using type = First; using next = type_arr; }; template struct type_arr { using type = First; }; template using void_type = void; template constexpr bool is_type_arr_end = true; template constexpr bool is_type_arr_end> = false; template inline unsigned int search_n_get_item_per_block() { using input_type = InputType; using config = Config; using wrapped_config = rocprim::detail::wrapped_search_n_config; hipStream_t stream = 0; // default rocprim::detail::target_arch target_arch; HIP_CHECK(rocprim::detail::host_target_arch(stream, target_arch)); const auto params = rocprim::detail::dispatch_target_arch(target_arch); const unsigned int block_size = params.kernel_config.block_size; const unsigned int items_per_thread = params.kernel_config.items_per_thread; const unsigned int items_per_block = block_size * items_per_thread; return items_per_block; } enum class benchmark_search_n_mode { NORMAL = 0, NOISE = 1, }; inline std::string to_string(benchmark_search_n_mode e) noexcept { switch(e) { case benchmark_search_n_mode::NORMAL: return "NORMAL"; case benchmark_search_n_mode::NOISE: return "NOISE"; default: return "UNKNOWN"; } } } // namespace template class benchmark_search_n { public: const managed_seed seed; const hipStream_t stream; size_t size_byte; size_t count_byte; size_t start_pos_byte; InputType value; std::vector input; private: size_t size; size_t count; size_t start_pos; const size_t warmup_size = 10; const size_t batch_size = 10; size_t temp_storage_size = 0; size_t noise_sequence = 0; 
bool create_noise = false; hipEvent_t start; hipEvent_t stop; void* d_temp_storage = nullptr; InputType* d_input; OutputType* d_output; InputType* d_value; void create() noexcept { switch(mode) { case benchmark_search_n_mode::NORMAL: { input.resize(size); if(start_pos + count < size) { std::fill(input.begin(), input.begin() + start_pos, 0); std::fill(input.begin() + start_pos, input.begin() + count + start_pos, value); std::fill(input.begin() + count + start_pos, input.end(), 0); } else { std::fill(input.begin(), input.end(), 0); } break; } case benchmark_search_n_mode::NOISE: { InputType h_noise{0}; input = std::vector(size, value); if(create_noise) { size_t cur_tile = 0; size_t last_tile = size / count - 1; while(cur_tile != last_tile) { input[cur_tile * count + count - 1] = h_noise; ++cur_tile; } } break; } default: { break; } } HIP_CHECK(hipMallocAsync(&d_value, sizeof(InputType), stream)); HIP_CHECK(hipMallocAsync(&d_input, sizeof(InputType) * input.size(), stream)); HIP_CHECK(hipMallocAsync(&d_output, sizeof(OutputType), stream)); HIP_CHECK( hipMemcpyAsync(d_value, &value, sizeof(InputType), hipMemcpyHostToDevice, stream)); HIP_CHECK(hipMemcpyAsync(d_input, input.data(), sizeof(InputType) * input.size(), hipMemcpyHostToDevice, stream)); HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); } void release() noexcept { decltype(input) tmp; input.swap(tmp); // clear input memspace HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); HIP_CHECK(hipFree(d_value)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } void launch_search_n() { HIP_CHECK(::rocprim::search_n(d_temp_storage, temp_storage_size, d_input, d_output, size, count, d_value, rocprim::equal_to{}, stream, false)); } static void run(benchmark::State& state, benchmark_search_n const* _self) { auto& self = *const_cast(_self); self.create(); // allocate memory self.launch_search_n(); HIP_CHECK(hipMallocAsync(&self.d_temp_storage, self.temp_storage_size, 
self.stream)); // Warm-up for(size_t i = 0; i < self.warmup_size; i++) { self.launch_search_n(); } HIP_CHECK(hipStreamSynchronize(self.stream)); // Run for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(self.start, self.stream)); for(size_t i = 0; i < self.batch_size; i++) { self.launch_search_n(); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(self.stop, self.stream)); HIP_CHECK(hipEventSynchronize(self.stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, self.start, self.stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Clean-up HIP_CHECK(hipFree(self.d_temp_storage)); self.d_temp_storage = nullptr; self.temp_storage_size = 0; state.SetBytesProcessed(state.iterations() * self.batch_size * self.size * sizeof(*(self.d_input))); state.SetItemsProcessed(state.iterations() * self.batch_size * self.size); self.release(); } public: benchmark_search_n( const managed_seed _seed, const hipStream_t _stream, const size_t _size_byte, const size_t _count_byte, // for NOISE benchmarks, this is the multiple of count const size_t _start_pos_byte) noexcept : seed(_seed) , stream(_stream) , size_byte(_size_byte) , count_byte(_count_byte) , start_pos_byte(_start_pos_byte) , value{1} , input() { switch(mode) { case benchmark_search_n_mode::NORMAL: { size = size_byte / sizeof(InputType); count = count_byte / sizeof(InputType); start_pos = start_pos_byte / sizeof(InputType); break; } case benchmark_search_n_mode::NOISE: { size = size_byte / sizeof(InputType); count = count_byte; noise_sequence = _start_pos_byte == (size_t)-1 ? 
search_n_get_item_per_block() : _start_pos_byte; if(size > noise_sequence * count) { count = noise_sequence * count; create_noise = true; } break; } } } benchmark::internal::Benchmark* bench_register() const noexcept { return benchmark::RegisterBenchmark( bench_naming::format_name( "{lvl:device,algo:search_n,input_type:" + std::string(typeid(InputType).name()) + ",size:" + std::to_string(size) + ",count:" + std::to_string(count) + ",mode:" + to_string(mode) + ",cfg:default_config}") .c_str(), run, this); } }; using destructor_t = std::function; static std::vector destructors; static void clean_up_benchmarks_search_n() { for(auto& i : destructors) { i(); } destructors = {}; } template inline void add_one_benchmark_search_n(std::vector& benchmarks, const managed_seed _seed, const hipStream_t _stream, const size_t _size_byte) { // normal auto start_from_0 = new benchmark_search_n(_seed, _stream, _size_byte, _size_byte, 0); auto start_from_mid = new benchmark_search_n(_seed, _stream, _size_byte, _size_byte / 2, _size_byte / 2); // small count test auto small_count6 = new benchmark_search_n(_seed, _stream, _size_byte, 1, // count times 6); // mid count test auto mid_count4095 = new benchmark_search_n(_seed, _stream, _size_byte, 1, // count times 4095); // big input auto big_count6 = new benchmark_search_n(_seed, _stream, _size_byte, 6, // count times (size_t)-1); std::vector bs = {start_from_0->bench_register(), start_from_mid->bench_register(), small_count6->bench_register(), mid_count4095->bench_register(), big_count6->bench_register()}; destructors.emplace_back( [=]() { delete start_from_0; delete start_from_mid; delete small_count6; delete mid_count4095; delete big_count6; }); benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } template, bool> = true> inline void add_benchmark_search_n(std::vector& benchmarks, const managed_seed _seed, const hipStream_t _stream, const size_t _size_byte) { add_one_benchmark_search_n(benchmarks, _seed, _stream, _size_byte); 
add_benchmark_search_n(benchmarks, _seed, _stream, _size_byte); } template, bool> = true> inline void add_benchmark_search_n(std::vector& benchmarks, const managed_seed _seed, const hipStream_t _stream, const size_t _size_byte) { add_one_benchmark_search_n(benchmarks, _seed, _stream, _size_byte); } typedef type_arr benchmark_search_n_types; template struct device_search_n_benchmark_generator { // TODO: add implementation struct create_search_n_algorithm {}; // TODO: add implementation static void create(std::vector>&) {} }; #endif // ROCPRIM_BENCHMARK_DEVICE_SEARCH_N_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_segmented_radix_sort_keys.cpp000066400000000000000000000324351502235215600272000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif namespace rp = rocprim; namespace { constexpr unsigned int warmup_size = 2; constexpr size_t min_size = 30000; constexpr std::array segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; constexpr std::array segment_lengths{30, 256, 3000, 300000}; } // namespace // This benchmark only handles the rocprim::segmented_radix_sort_keys function. The benchmark was separated into two (keys and pairs), // because the binary became too large to link. Runs into a "relocation R_X86_64_PC32 out of range" error. // This happens partially, because of the algorithm has 4 kernels, and decides at runtime which one to call. template void run_sort_keys_benchmark(benchmark::State& state, size_t num_segments, size_t mean_segment_length, size_t target_bytes, const managed_seed& seed, hipStream_t stream) { using offset_type = int; using key_type = Key; // Calculate the number of elements size_t target_size = target_bytes / sizeof(key_type); std::vector offsets; offsets.push_back(0); static constexpr int iseed = 716; engine_type gen(iseed); std::normal_distribution segment_length_dis(static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if(segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); size_t batch_size = 1; if(size < target_size) { 
batch_size = (target_size + size - 1) / size; } offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, offsets.size() * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), offsets.size() * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::segmented_radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); 
HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void add_sort_keys_benchmarks(std::vector& benchmarks, size_t max_bytes, size_t min_size, size_t target_size, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements size_t max_size = max_bytes / sizeof(KeyT); std::string key_name = Traits::name(); std::string value_name = Traits::name(); for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements > max_size || number_of_elements < min_size) { continue; } benchmarks.push_back(benchmark::RegisterBenchmark( bench_naming::format_name( "{lvl:device,algo:radix_sort_segmented,key_type:" + key_name + ",value_type:" + value_name + ",segment_count:" + std::to_string(segment_count) + ",segment_length:" + std::to_string(segment_length) + ",cfg:default_config}") .c_str(), [=](benchmark::State& state) { run_sort_keys_benchmark(state, segment_count, segment_length, target_size, seed, stream); })); } } } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total 
parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; #ifdef BENCHMARK_CONFIG_TUNING (void)min_size; const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, min_size, seed, stream); #else add_sort_keys_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_keys_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_keys_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_keys_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_keys_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_keys_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_segmented_radix_sort_keys.parallel.cpp.in000066400000000000000000000032731502235215600313760ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_segmented_radix_sort_keys.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_segmented_radix_sort_benchmark_generator< @LongBits@, 0, @BlockSize@, @ItemsPerThread@, @WarpSmallLWS@, @WarpSmallIPT@, @WarpSmallBS@, @WarpPartition@, @WarpMediumLWS@, @WarpMediumIPT@, @WarpMediumBS@, @KeyType@, true >::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_segmented_radix_sort_keys.parallel.hpp000066400000000000000000000325751502235215600310050ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_KEYS_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_KEYS_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include template std::string warp_sort_config_name(T const& warp_sort_config) { return "{pa:" + std::to_string(warp_sort_config.partitioning_allowed) + ",lwss:" + std::to_string(warp_sort_config.logical_warp_size_small) + ",ipts:" + std::to_string(warp_sort_config.items_per_thread_small) + ",bss:" + std::to_string(warp_sort_config.block_size_small) + ",pt:" + std::to_string(warp_sort_config.partitioning_threshold) + ",lwsm:" + std::to_string(warp_sort_config.logical_warp_size_medium) + ",iptm:" + std::to_string(warp_sort_config.items_per_thread_medium) + ",bsm:" + std::to_string(warp_sort_config.block_size_medium) + "}"; } template std::string config_name() { const rocprim::detail::segmented_radix_sort_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",lrb:" + std::to_string(config.long_radix_bits) + ",srb:" + std::to_string(config.short_radix_bits) + ",eupws:" + std::to_string(config.enable_unpartitioned_warp_sort) + ",wsc:" + warp_sort_config_name(config.warp_sort_config) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_segmented_radix_sort_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; const rocprim::detail::segmented_radix_sort_config_params config = Config(); return bench_naming::format_name( "{lvl:device,algo:segmented_radix_sort,key_type:" + std::string(Traits::name()) + ",value_type:empty_type" + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void 
run_benchmark(benchmark::State& state, size_t num_segments, size_t mean_segment_length, size_t target_size, const managed_seed& seed, hipStream_t stream) const { using offset_type = int; using key_type = Key; std::vector offsets; offsets.push_back(0); static constexpr int iseed = 716; engine_type gen(iseed); std::normal_distribution segment_length_dis( static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if(segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); size_t batch_size = 1; if(size < target_size) { batch_size = (target_size + size - 1) / size; } offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, offsets.size() * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), offsets.size() * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::segmented_radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { 
HIP_CHECK(rocprim::segmented_radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::segmented_radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(Key); constexpr std::array segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; constexpr std::array segment_lengths{30, 256, 3000, 300000}; for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements > 33554432 || number_of_elements < 300000) { continue; } run_benchmark(state, segment_count, segment_length, size, seed, stream); } } 
} }; template class T, bool enable, Tp... Idx> struct decider; template struct device_segmented_radix_sort_benchmark_generator { static void create(std::vector>& storage) { storage.emplace_back(std::make_unique, rocprim::WarpSortConfig, UnpartitionWarpAllowed>>>()); } }; template class T, Tp... Idx> struct decider { inline static void do_the_thing(std::vector>& storage) { static_for_each, T>(storage); } }; template class T, Tp... Idx> struct decider { inline static void do_the_thing(std::vector>& /*storage*/) {} }; #endif // ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_KEYS_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_segmented_radix_sort_pairs.cpp000066400000000000000000000374531502235215600273500ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include "benchmark/benchmark.h" // HIP API #include // rocPRIM #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif namespace rp = rocprim; namespace { constexpr unsigned int warmup_size = 2; constexpr size_t min_size = 30000; constexpr std::array segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; constexpr std::array segment_lengths{30, 256, 3000, 300000}; } // namespace // This benchmark only handles the rocprim::segmented_radix_sort_pairs function. The benchmark was separated into two (keys and pairs), // because the binary became too large to link. Runs into a "relocation R_X86_64_PC32 out of range" error. // This happens partially, because of the algorithm has 4 kernels, and decides at runtime which one to call. template void run_sort_pairs_benchmark(benchmark::State& state, size_t num_segments, size_t mean_segment_length, size_t target_bytes, const managed_seed& seed, hipStream_t stream) { using offset_type = int; using key_type = Key; using value_type = Value; // Calculate the number of elements size_t target_size = target_bytes / sizeof(key_type); // Generate data std::vector offsets; offsets.push_back(0); static constexpr int iseed = 716; engine_type gen(iseed); std::normal_distribution segment_length_dis(static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if(segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input = get_random_data(size, generate_limits::min(), 
generate_limits::max(), seed.get_0()); size_t batch_size = 1; if(size < target_size) { batch_size = (target_size + size - 1) / size; } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; 
i++) { HIP_CHECK(rp::segmented_radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } template void add_sort_pairs_benchmarks(std::vector& benchmarks, size_t max_bytes, size_t min_size, size_t target_size, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements size_t max_size = max_bytes / sizeof(KeyT); std::string key_name = Traits::name(); std::string value_name = Traits::name(); for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements > max_size || number_of_elements < min_size) { continue; } benchmarks.push_back(benchmark::RegisterBenchmark( bench_naming::format_name( "{lvl:device,algo:radix_sort_segmented,key_type:" + key_name + ",value_type:" + value_name + ",segment_count:" + std::to_string(segment_count) + ",segment_length:" + std::to_string(segment_length) + ",cfg:default_config}") .c_str(), [=](benchmark::State& state) { run_sort_pairs_benchmark(state, segment_count, segment_length, target_size, seed, stream); 
})); } } } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; #ifdef BENCHMARK_CONFIG_TUNING (void)min_size; const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else using custom_float2 = custom_type; using custom_double2 = custom_type; add_sort_pairs_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, min_size, bytes 
/ 2, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); add_sort_pairs_benchmarks(benchmarks, bytes, min_size, bytes / 2, seed, stream); #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.cpp.in000066400000000000000000000033211502235215600315330ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include "benchmark_device_segmented_radix_sort_pairs.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_segmented_radix_sort_benchmark_generator< @LongBits@, 8, @BlockSize@, @ItemsPerThread@, @WarpSmallLWS@, @WarpSmallIPT@, @WarpSmallBS@, @WarpPartition@, @WarpMediumLWS@, @WarpMediumIPT@, @WarpMediumBS@, @KeyType@, @ValueType@, true >::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.hpp000066400000000000000000000357001502235215600311410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_PAIRS_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_PAIRS_PARALLEL_HPP_ #include "benchmark_utils.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include template std::string warp_sort_config_name(T const& warp_sort_config) { return "{pa:" + std::to_string(warp_sort_config.partitioning_allowed) + ",lwss:" + std::to_string(warp_sort_config.logical_warp_size_small) + ",ipts:" + std::to_string(warp_sort_config.items_per_thread_small) + ",bss:" + std::to_string(warp_sort_config.block_size_small) + ",pt:" + std::to_string(warp_sort_config.partitioning_threshold) + ",lwsm:" + std::to_string(warp_sort_config.logical_warp_size_medium) + ",iptm:" + std::to_string(warp_sort_config.items_per_thread_medium) + ",bsm:" + std::to_string(warp_sort_config.block_size_medium) + "}"; } template std::string config_name() { const rocprim::detail::segmented_radix_sort_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",lrb:" + std::to_string(config.long_radix_bits) + ",srb:" + std::to_string(config.short_radix_bits) + ",eupws:" + std::to_string(config.enable_unpartitioned_warp_sort) + ",wsc:" + warp_sort_config_name(config.warp_sort_config) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_segmented_radix_sort_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; const rocprim::detail::segmented_radix_sort_config_params config = Config(); return bench_naming::format_name("{lvl:device,algo:segmented_radix_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void 
run_benchmark(benchmark::State& state, size_t num_segments, size_t mean_segment_length, size_t target_size, const managed_seed& seed, hipStream_t stream) const { using offset_type = int; using key_type = Key; using value_type = Value; std::vector offsets; offsets.push_back(0); static constexpr int iseed = 716; engine_type gen(iseed); std::normal_distribution segment_length_dis( static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if(segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector values_input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); size_t batch_size = 1; if(size < target_size) { batch_size = (target_size + size - 1) / size; } offset_type* d_offsets; HIP_CHECK(hipMalloc(&d_offsets, offsets.size() * sizeof(offset_type))); HIP_CHECK(hipMemcpy(d_offsets, offsets.data(), offsets.size() * sizeof(offset_type), hipMemcpyHostToDevice)); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage 
= nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::segmented_radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rocprim::segmented_radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::segmented_radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t bytes, const 
managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(Key); constexpr std::array segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; constexpr std::array segment_lengths{30, 256, 3000, 300000}; for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements > 33554432 || number_of_elements < 300000) { continue; } run_benchmark(state, segment_count, segment_length, size, seed, stream); } } } }; template class T, bool enable, Tp... Idx> struct decider; template struct device_segmented_radix_sort_benchmark_generator { static void create(std::vector>& storage) { storage.emplace_back(std::make_unique, rocprim::WarpSortConfig, UnpartitionWarpAllowed>>>()); } }; template class T, Tp... Idx> struct decider { inline static void do_the_thing(std::vector>& storage) { static_for_each, T>(storage); } }; template class T, Tp... Idx> struct decider { inline static void do_the_thing(std::vector>& /*storage*/) {} }; #endif // ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_PAIRS_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_segmented_reduce.cpp000066400000000000000000000225171502235215600252360ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif namespace rp = rocprim; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t desired_segments, size_t bytes, const managed_seed& seed, hipStream_t stream) { using offset_type = int; using value_type = T; // Calculate the number of elements size_t size = bytes / sizeof(T); // Generate data engine_type gen(seed.get_0()); const double avg_segment_length = static_cast(size) / desired_segments; std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); std::vector offsets; unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); value_type * d_aggregates_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_aggregates_output), segments_count * sizeof(value_type))); rocprim::plus reduce_op; value_type init(0); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, 
segments_count, d_offsets, d_offsets + 1, reduce_op, init, stream ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rp::segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, reduce_op, init, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rp::segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, reduce_op, init, stream ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_aggregates_output)); } #define CREATE_BENCHMARK(T, SEGMENTS) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:reduce_segmented,key_type:" #T \ ",segment_count:" \ + std::to_string(SEGMENTS) + ",cfg:default_config}") \ .c_str(), \ run_benchmark, \ SEGMENTS, \ bytes, \ seed, \ stream) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 1), \ CREATE_BENCHMARK(type, 10), \ CREATE_BENCHMARK(type, 100), \ CREATE_BENCHMARK(type, 1000), \ 
CREATE_BENCHMARK(type, 10000) void add_benchmarks(std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector bs = { BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_TYPE(int), BENCHMARK_TYPE(custom_float2), BENCHMARK_TYPE(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); // fixed seed as a random seed adds a lot of variance parser.set_optional("seed", "seed", "321", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_select.cpp000066400000000000000000000234551502235215600232150ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. 
All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_select.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include "benchmark/benchmark.h" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif #define CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(T, F, p) \ { \ const device_select_predicated_flag_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_SELECT_FLAG_BENCHMARK(T, F, p) \ { \ const device_select_flag_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_SELECT_PREDICATE_BENCHMARK(T, p) \ { \ const device_select_predicate_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_UNIQUE_BENCHMARK(T, p) \ { \ const device_select_unique_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ { \ const device_select_unique_by_key_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } #define BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(type, value) \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p005); \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p025); \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p050); \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p075) #define BENCHMARK_SELECT_FLAG_TYPE(type, value) \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p005); \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p025); \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p050); \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p075) #define BENCHMARK_SELECT_PREDICATE_TYPE(type) \ 
CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p005); \ CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p025); \ CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p050); \ CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p075) #define BENCHMARK_UNIQUE_TYPE(type) \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p005); \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p025); \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p050); \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p075) #define BENCHMARK_UNIQUE_BY_KEY_TYPE(K, V) \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p005); \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p025); \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p050); \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p075) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); 
benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else using custom_double2 = custom_type; using custom_int_double = custom_type; BENCHMARK_SELECT_FLAG_TYPE(int, unsigned char); BENCHMARK_SELECT_FLAG_TYPE(float, unsigned char); BENCHMARK_SELECT_FLAG_TYPE(double, unsigned char); BENCHMARK_SELECT_FLAG_TYPE(uint8_t, uint8_t); BENCHMARK_SELECT_FLAG_TYPE(int8_t, int8_t); BENCHMARK_SELECT_FLAG_TYPE(rocprim::half, int8_t); BENCHMARK_SELECT_FLAG_TYPE(custom_double2, unsigned char); BENCHMARK_SELECT_PREDICATE_TYPE(int); BENCHMARK_SELECT_PREDICATE_TYPE(float); BENCHMARK_SELECT_PREDICATE_TYPE(double); BENCHMARK_SELECT_PREDICATE_TYPE(uint8_t); BENCHMARK_SELECT_PREDICATE_TYPE(int8_t); BENCHMARK_SELECT_PREDICATE_TYPE(rocprim::half); BENCHMARK_SELECT_PREDICATE_TYPE(custom_int_double); BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(int, unsigned char); BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(float, unsigned char); BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(double, unsigned char); BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(uint8_t, uint8_t); BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(int8_t, int8_t); BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(rocprim::half, int8_t); BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(custom_double2, unsigned char); BENCHMARK_UNIQUE_TYPE(int); BENCHMARK_UNIQUE_TYPE(float); BENCHMARK_UNIQUE_TYPE(double); BENCHMARK_UNIQUE_TYPE(uint8_t); BENCHMARK_UNIQUE_TYPE(int8_t); BENCHMARK_UNIQUE_TYPE(rocprim::half); BENCHMARK_UNIQUE_TYPE(custom_int_double); BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int); BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double); BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2); BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t); BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, 
double); BENCHMARK_UNIQUE_BY_KEY_TYPE(rocprim::half, rocprim::half); BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double); #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_select.parallel.cpp.in000066400000000000000000000026331502235215600254100ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include "benchmark_device_select.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_select_benchmark_generator<@KeyType@, @ValueType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-6.4.3/benchmark/benchmark_device_select.parallel.hpp000066400000000000000000001026511502235215600250110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SELECT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SELECT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "cmdparser.hpp" #include #include #include #include #include #include #include #include #include enum class select_probability { p005, p025, p050, p075, tuning }; inline float get_probability(select_probability probability) { switch(probability) { case select_probability::p005: return 0.05f; case select_probability::p025: return 0.25f; case select_probability::p050: return 0.50f; case select_probability::p075: return 0.75f; case select_probability::tuning: return 0.0f; // not used } return 0.0f; } inline const char* get_probability_name(select_probability probability) { switch(probability) { case select_probability::p005: return "0.05"; case select_probability::p025: return "0.25"; case select_probability::p050: return "0.50"; case select_probability::p075: return "0.75"; case select_probability::tuning: return "tuning"; } return "invalid"; } constexpr int warmup_iter = 5; constexpr int batch_size = 10; template struct device_select_flag_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 
1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } DataType* d_input{}; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); FlagType* d_flags_0{}; FlagType* d_flags_1{}; FlagType* d_flags_2{}; HIP_CHECK(hipMalloc(&d_flags_0, size * sizeof(*d_flags_0))); HIP_CHECK( hipMemcpy(d_flags_0, flags_0.data(), size * sizeof(*d_flags_0), hipMemcpyHostToDevice)); if(is_tuning) { HIP_CHECK(hipMalloc(&d_flags_1, size * sizeof(*d_flags_1))); HIP_CHECK(hipMemcpy(d_flags_1, flags_1.data(), size * sizeof(*d_flags_1), hipMemcpyHostToDevice)); HIP_CHECK(hipMalloc(&d_flags_2, size * sizeof(*d_flags_2))); HIP_CHECK(hipMemcpy(d_flags_2, flags_2.data(), size * sizeof(*d_flags_2), hipMemcpyHostToDevice)); } DataType* d_output{}; HIP_CHECK(hipMalloc(&d_output, size * sizeof(*d_output))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](FlagType* d_flags) { HIP_CHECK(rocprim::select(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, size, stream)); }; dispatch_flags(d_flags_0); if(is_tuning) { dispatch_flags(d_flags_1); dispatch_flags(d_flags_2); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; i++) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, 
temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); if(is_tuning) { HIP_CHECK(hipFree(d_flags_2)); HIP_CHECK(hipFree(d_flags_1)); } HIP_CHECK(hipFree(d_flags_0)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template struct device_select_predicate_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:predicate,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), static_cast(126), seed.get_0()); DataType* d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); DataType* d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(*d_output))); unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto 
dispatch_predicate = [&](float probability) { auto predicate = [probability](const DataType& value) -> bool { return value < static_cast(127 * probability); }; HIP_CHECK(rocprim::select(d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, size, predicate, stream)); }; if(is_tuning) { dispatch_predicate(0.0f); dispatch_predicate(0.5f); dispatch_predicate(1.0f); } else { dispatch_predicate(get_probability(Probability)); } }; size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template struct device_select_predicated_flag_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:select,subalgo:predicated_flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + 
",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, generate_limits::min(), generate_limits::max(), seed.get_0()); std::vector flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } DataType* d_input{}; HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(*d_input), hipMemcpyHostToDevice)); FlagType* d_flags_0{}; FlagType* d_flags_1{}; FlagType* d_flags_2{}; HIP_CHECK(hipMalloc(&d_flags_0, size * sizeof(*d_flags_0))); HIP_CHECK( hipMemcpy(d_flags_0, flags_0.data(), size * sizeof(*d_flags_0), hipMemcpyHostToDevice)); if(is_tuning) { HIP_CHECK(hipMalloc(&d_flags_1, size * sizeof(*d_flags_1))); HIP_CHECK(hipMemcpy(d_flags_1, flags_1.data(), size * sizeof(*d_flags_1), hipMemcpyHostToDevice)); HIP_CHECK(hipMalloc(&d_flags_2, size * sizeof(*d_flags_2))); HIP_CHECK(hipMemcpy(d_flags_2, flags_2.data(), size * sizeof(*d_flags_2), hipMemcpyHostToDevice)); } DataType* d_output{}; HIP_CHECK(hipMalloc(&d_output, size * sizeof(*d_output))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicated_flags = [&](FlagType* d_flags) { auto predicate = [](const FlagType& value) -> bool { return value; }; HIP_CHECK(rocprim::select(d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, 
d_selected_count_output, size, predicate, stream)); }; dispatch_predicated_flags(d_flags_0); if(is_tuning) { dispatch_predicated_flags(d_flags_1); dispatch_predicated_flags(d_flags_2); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; i++) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); if(is_tuning) { HIP_CHECK(hipFree(d_flags_2)); HIP_CHECK(hipFree(d_flags_1)); } HIP_CHECK(hipFree(d_flags_0)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template inline std::vector get_unique_input(size_t size, float probability, unsigned int seed) { using op_type = typename std::conditional::value, half_plus, rocprim::plus>::type; op_type op; std::vector input(size); auto input01 = get_random_data01(size, probability, seed); auto acc = input01[0]; input[0] = acc; for(size_t i = 1; i < input01.size(); i++) { input[i] = op(acc, input01[i]); } return input; } template struct device_select_unique_benchmark : 
public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:unique,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input_0; std::vector input_1; std::vector input_2; if(is_tuning) { input_0 = get_unique_input(size, 0.0f, seed.get_0()); input_1 = get_unique_input(size, 0.5f, seed.get_0()); input_2 = get_unique_input(size, 1.0f, seed.get_0()); } else { input_0 = get_unique_input(size, get_probability(Probability), seed.get_0()); } DataType* d_input_0{}; DataType* d_input_1{}; DataType* d_input_2{}; HIP_CHECK(hipMalloc(&d_input_0, size * sizeof(*d_input_0))); HIP_CHECK( hipMemcpy(d_input_0, input_0.data(), size * sizeof(*d_input_0), hipMemcpyHostToDevice)); if(is_tuning) { HIP_CHECK(hipMalloc(&d_input_1, size * sizeof(*d_input_1))); HIP_CHECK(hipMemcpy(d_input_1, input_1.data(), size * sizeof(*d_input_1), hipMemcpyHostToDevice)); HIP_CHECK(hipMalloc(&d_input_2, size * sizeof(*d_input_2))); HIP_CHECK(hipMemcpy(d_input_2, input_2.data(), size * sizeof(*d_input_2), hipMemcpyHostToDevice)); } DataType* d_output{}; HIP_CHECK(hipMalloc(&d_output, size * sizeof(*d_output))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](DataType* d_input) { HIP_CHECK(rocprim::unique(d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, size, rocprim::equal_to(), stream)); }; dispatch_flags(d_input_0); if(is_tuning) { dispatch_flags(d_input_1); dispatch_flags(d_input_2); } 
}; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(DataType)); state.SetItemsProcessed(state.iterations() * batch_size * size); if(is_tuning) { HIP_CHECK(hipFree(d_input_2)); HIP_CHECK(hipFree(d_input_1)); } HIP_CHECK(hipFree(d_input_0)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template struct device_select_unique_by_key_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:unique_by_key,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { // Calculate the number of elements size_t size = bytes / sizeof(KeyType); std::vector input_keys_0; std::vector input_keys_1; 
std::vector input_keys_2; if(is_tuning) { input_keys_0 = get_unique_input(size, 0.0f, seed.get_0()); input_keys_1 = get_unique_input(size, 0.5f, seed.get_0()); input_keys_2 = get_unique_input(size, 1.0f, seed.get_0()); } else { input_keys_0 = get_unique_input(size, get_probability(Probability), seed.get_0()); } const auto random_range = limit_random_range(-1000, 1000); const auto input_values = get_random_data(size, random_range.first, random_range.second, seed.get_1()); KeyType* d_keys_input_0{}; KeyType* d_keys_input_1{}; KeyType* d_keys_input_2{}; HIP_CHECK(hipMalloc(&d_keys_input_0, size * sizeof(*d_keys_input_0))); HIP_CHECK(hipMemcpy(d_keys_input_0, input_keys_0.data(), size * sizeof(*d_keys_input_0), hipMemcpyHostToDevice)); if(is_tuning) { HIP_CHECK(hipMalloc(&d_keys_input_1, size * sizeof(*d_keys_input_1))); HIP_CHECK(hipMemcpy(d_keys_input_1, input_keys_1.data(), size * sizeof(*d_keys_input_1), hipMemcpyHostToDevice)); HIP_CHECK(hipMalloc(&d_keys_input_2, size * sizeof(*d_keys_input_2))); HIP_CHECK(hipMemcpy(d_keys_input_2, input_keys_2.data(), size * sizeof(*d_keys_input_2), hipMemcpyHostToDevice)); } ValueType* d_values_input{}; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(*d_values_input))); HIP_CHECK(hipMemcpy(d_values_input, input_values.data(), size * sizeof(*d_values_input), hipMemcpyHostToDevice)); KeyType* d_keys_output{}; HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(*d_keys_output))); ValueType* d_values_output{}; HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(*d_values_output))); unsigned int* d_selected_count_output{}; HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(*d_selected_count_output))); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](KeyType* d_keys_input) { HIP_CHECK(rocprim::unique_by_key(d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, size, rocprim::equal_to(), 
stream)); }; dispatch_flags(d_keys_input_0); if(is_tuning) { dispatch_flags(d_keys_input_1); dispatch_flags(d_keys_input_2); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); void* d_temp_storage{}; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); for(int i = 0; i < warmup_iter; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipDeviceSynchronize()); hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { HIP_CHECK(hipEventRecord(start, stream)); for(int i = 0; i < batch_size; ++i) { dispatch(d_temp_storage, temp_storage_size_bytes); } HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds{}; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(KeyType) + sizeof(ValueType))); state.SetItemsProcessed(state.iterations() * batch_size * size); if(is_tuning) { HIP_CHECK(hipFree(d_keys_input_2)); HIP_CHECK(hipFree(d_keys_input_1)); } HIP_CHECK(hipFree(d_keys_input_0)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_output)); HIP_CHECK(hipFree(d_selected_count_output)); HIP_CHECK(hipFree(d_temp_storage)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; #ifdef BENCHMARK_CONFIG_TUNING template struct create_benchmark { static constexpr unsigned int block_size = Config().kernel_config.block_size; static constexpr unsigned int items_per_thread = Config().kernel_config.items_per_thread; static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = sizeof(KeyType) + sizeof(ValueType); static constexpr unsigned int 
max_items_per_thread = max_shared_memory / (block_size * max_size_per_element); void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique>()); if(items_per_thread <= max_items_per_thread) { storage.emplace_back( std::make_unique< device_select_predicated_flag_benchmark>()); } } }; template struct create_benchmark { void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back(std::make_unique>()); } }; template struct device_select_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::select_config; create_benchmark{}(storage); } }; static void create(std::vector>& storage) { static constexpr int max_items_per_thread = std::min(64 / std::max(sizeof(KeyType), sizeof(ValueType)), size_t{32}); static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_SELECT_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_device_transform.cpp000066400000000000000000000120101502235215600237320ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_transform.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif #define CREATE_BENCHMARK(T) \ { \ const device_transform_benchmark instance{}; \ REGISTER_BENCHMARK(benchmarks, bytes, seed, stream, instance); \ } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif // BENCHMARK_CONFIG_TUNING parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 
0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, bytes, seed, stream); #else // BENCHMARK_CONFIG_TUNING using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(float) CREATE_BENCHMARK(double) CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_transform.parallel.cpp.in000066400000000000000000000026431502235215600261450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_transform.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_transform_benchmark_generator< @DataType@, @BlockSize@>::create); } rocPRIM-rocm-6.4.3/benchmark/benchmark_device_transform.parallel.hpp000066400000000000000000000140451502235215600255440ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_TRANSFORM_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_TRANSFORM_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include "benchmark_utils.hpp" template std::string transform_config_name() { auto config = Config(); return "{bs:" + std::to_string(config.block_size) + ",ipt:" + std::to_string(config.items_per_thread) + "}"; } template<> inline std::string transform_config_name() { return "default_config"; } template struct device_transform_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:transform,value_type:" + std::string(Traits::name()) + ",cfg:" + transform_config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) const override { using output_type = T; // Calculate the number of elements size_t size = bytes / sizeof(T); static constexpr bool debug_synchronous = false; // Generate data const auto random_range = limit_random_range(1, 100); const std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T* d_input; output_type* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); const auto launch = [&] { auto transform_op = [](T v) { return v + T(5); }; return rocprim::transform(d_input, 
d_output, size, transform_op, stream, debug_synchronous); }; // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(launch()); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); // Run for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(launch()); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } }; template struct device_transform_benchmark_generator { template struct create_ipt { using generated_config = rocprim::transform_config; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique>()); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread = 0; static constexpr unsigned int max_items_per_thread = rocprim::Log2<16>::VALUE; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_TRANSFORM_PARALLEL_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_predicate_iterator.cpp000066400000000000000000000222101502235215600242540ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "cmdparser.hpp" #include #include // rocPRIM #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 128 * 4; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template struct identity { __device__ T operator()(T value) { return value; } }; template struct less_than { __device__ bool operator()(T value) const { return value < T{C}; } }; template struct increment { __device__ T operator()(T value) const { return value + T{I}; } }; template struct transform_op { __device__ auto operator()(T v) const { return Predicate{}(v) ? 
Transform{}(v) : v; } }; template struct transform_it { using value_type = T; void operator()(T* d_input, T* d_output, const size_t size, const hipStream_t stream) { auto t_it = rocprim::make_transform_iterator(d_input, transform_op{}); HIP_CHECK(rocprim::transform(t_it, d_output, size, identity{}, stream)); } }; template struct read_predicate_it { using value_type = T; void operator()(T* d_input, T* d_output, const size_t size, const hipStream_t stream) { auto t_it = rocprim::make_transform_iterator(d_input, Transform{}); auto r_it = rocprim::make_predicate_iterator(t_it, d_input, Predicate{}); HIP_CHECK(rocprim::transform(r_it, d_output, size, identity{}, stream)); } }; template struct write_predicate_it { using value_type = T; void operator()(T* d_input, T* d_output, const size_t size, const hipStream_t stream) { auto t_it = rocprim::make_transform_iterator(d_input, Transform{}); auto w_it = rocprim::make_predicate_iterator(d_output, d_input, Predicate{}); HIP_CHECK(rocprim::transform(t_it, w_it, size, identity{}, stream)); } }; template void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) { using T = typename IteratorBenchmark::value_type; // Calculate the number of elements size_t size = bytes / sizeof(T); const auto random_range = limit_random_range(0, 99); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { IteratorBenchmark{}(d_input, d_output, size, stream); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { 
// Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { IteratorBenchmark{}(d_input, d_output, size, stream); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(B, T, C) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:device,algo:" #B ",p:p" #C \ ",key_type:" #T ",cfg:default_config}") \ .c_str(), \ run_benchmark, increment>>, \ bytes, \ seed, \ stream) // clang-format off #define CREATE_TYPED_BENCHMARK(T) \ CREATE_BENCHMARK(transform_it, T, 0), \ CREATE_BENCHMARK(read_predicate_it, T, 0), \ CREATE_BENCHMARK(write_predicate_it, T, 0), \ CREATE_BENCHMARK(transform_it, T, 25), \ CREATE_BENCHMARK(read_predicate_it, T, 25), \ CREATE_BENCHMARK(write_predicate_it, T, 25), \ CREATE_BENCHMARK(transform_it, T, 50), \ CREATE_BENCHMARK(read_predicate_it, T, 50), \ CREATE_BENCHMARK(write_predicate_it, T, 50), \ CREATE_BENCHMARK(transform_it, T, 75), \ CREATE_BENCHMARK(read_predicate_it, T, 75), \ CREATE_BENCHMARK(write_predicate_it, T, 75), \ CREATE_BENCHMARK(transform_it, T, 100), \ CREATE_BENCHMARK(read_predicate_it, T, 100), \ CREATE_BENCHMARK(write_predicate_it, T, 100) // clang-format on int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); 
parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); using custom_128 = custom_type; // Add benchmarks std::vector benchmarks = {CREATE_TYPED_BENCHMARK(int8_t), CREATE_TYPED_BENCHMARK(int16_t), CREATE_TYPED_BENCHMARK(int32_t), CREATE_TYPED_BENCHMARK(int64_t), CREATE_TYPED_BENCHMARK(custom_128)}; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_utils.hpp000066400000000000000000001222131502235215600215540ustar00rootroot00000000000000// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_BENCHMARK_UTILS_HPP_ #define ROCPRIM_BENCHMARK_UTILS_HPP_ #include // rocPRIM #include #include #include #include // partition_config_params #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define HIP_CHECK(condition) \ { \ hipError_t error = condition; \ if(error != hipSuccess) \ { \ std::cout << "HIP error: " << hipGetErrorString(error) << " file: " << __FILE__ \ << " line: " << __LINE__ << std::endl; \ exit(error); \ } \ } #define TUNING_SHARED_MEMORY_MAX 65536u // Support half operators on host side inline const char* get_seed_message() { return "seed for input generation, either an unsigned integer value for determinisic results " "or 'random' for different inputs for each repetition"; } /// \brief Provides a sequence of seeds. class managed_seed { public: /// \param[in] seed_string Either "random" to get random seeds, /// or an unsigned integer to get (a sequence) of deterministic seeds. managed_seed(const std::string& seed_string) { is_random = seed_string == "random"; if(!is_random) { const unsigned int seed = std::stoul(seed_string); std::seed_seq seq{seed}; seq.generate(seeds.begin(), seeds.end()); } } unsigned int get_0() const { return is_random ? std::random_device{}() : seeds[0]; } unsigned int get_1() const { return is_random ? std::random_device{}() : seeds[1]; } unsigned int get_2() const { return is_random ? 
std::random_device{}() : seeds[2]; } private: std::array seeds; bool is_random; }; ROCPRIM_HOST inline rocprim::native_half half_to_native(const rocprim::half& x) { return *reinterpret_cast(&x); } ROCPRIM_HOST inline rocprim::half native_to_half(const rocprim::native_half& x) { return *reinterpret_cast(&x); } struct half_less { ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a < b; #else return half_to_native(a) < half_to_native(b); #endif } }; struct half_plus { ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a + b; #else return native_to_half(half_to_native(a) + half_to_native(b)); #endif } }; struct half_equal_to { ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a == b; #else return half_to_native(a) == half_to_native(b); #endif } }; // std::uniform_int_distribution is undefined for anything other than: // short, int, long, long long, unsigned short, unsigned int, unsigned long, or unsigned long long template struct is_valid_for_int_distribution : std::integral_constant::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value > {}; template using it_value_t = typename std::iterator_traits::value_type; using engine_type = std::minstd_rand; // generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. 
template inline auto generate_random_data_n( OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) -> typename std::enable_if_t>::value, OutputIter> { using T = it_value_t; using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, typename std::conditional::value, int, unsigned int>::type >::type; std::uniform_int_distribution distribution((T)min, (T)max); std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(it, std::min(size - i, max_random_size), it + i); } return it + size; } template inline auto generate_random_data_n(OutputIterator it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) -> std::enable_if_t>::value, OutputIterator> { using T = typename std::iterator_traits::value_type; // Generate floats when T is half using dis_type = std::conditional_t::value || std::is_same::value, float, T>; std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(it, std::min(size - i, max_random_size), it + i); } return it + size; } template inline std::vector get_random_data01(size_t size, float p, unsigned int seed, size_t max_random_size = 1024 * 1024) { engine_type gen(seed); std::bernoulli_distribution distribution(p); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template struct custom_type { using first_type = T; using second_type = U; T x; U y; ROCPRIM_HOST_DEVICE inline custom_type(T xx = 0, U yy = 0) : x(xx), y(yy) { 
} ROCPRIM_HOST_DEVICE inline ~custom_type() = default; ROCPRIM_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const { return custom_type(x + rhs.x, y + rhs.y); } ROCPRIM_HOST_DEVICE inline bool operator<(const custom_type& rhs) const { // intentionally suboptimal choice for short-circuting, // required to generate more performant device code return ((x == rhs.x && y < rhs.y) || x < rhs.x); } ROCPRIM_HOST_DEVICE inline bool operator==(const custom_type& rhs) const { return x == rhs.x && y == rhs.y; } ROCPRIM_HOST_DEVICE custom_type& operator+=(const custom_type& rhs) { this->x += rhs.x; this->y += rhs.y; return *this; } }; template struct is_custom_type : std::false_type {}; template struct is_custom_type> : std::true_type {}; template struct is_comparable { private: // A dummy template function that attempts to compare two objects of types T and U template static auto test(V&& v, W&& w) -> decltype(std::declval() < std::declval(), std::true_type{}); // Fallback if the above template function is not valid template static std::false_type test(...); public: // Final result static constexpr bool value = decltype(test(std::declval(), std::declval()))::value; }; template struct is_comparable, T> : std::conditional_t::value || !std::is_same>::value, std::false_type, std::true_type> {}; template struct custom_type_decomposer { static_assert(is_custom_type::value, "custom_type_decomposer can only be used with instantiations of custom_type"); using T = typename CustomType::first_type; using U = typename CustomType::second_type; __host__ __device__ ::rocprim::tuple operator()(CustomType& key) const { return ::rocprim::tuple{key.x, key.y}; } }; template struct generate_limits; template struct generate_limits::value>> { static inline T min() { return rocprim::numeric_limits::min(); } static inline T max() { return rocprim::numeric_limits::max(); } }; template struct generate_limits::value>> { using F = typename T::first_type; using S = typename T::second_type; 
static inline T min() { return T(generate_limits::min(), generate_limits::min()); } static inline T max() { return T(generate_limits::max(), generate_limits::max()); } }; template struct generate_limits::value>> { static inline T min() { return T(-1000); } static inline T max() { return T(1000); } }; template inline auto generate_random_data_n(OutputIterator it, size_t size, it_value_t min, it_value_t max, Generator& gen, size_t max_random_size = 1024 * 1024) -> std::enable_if_t>::value, OutputIterator> { using T = it_value_t; using first_type = typename T::first_type; using second_type = typename T::second_type; std::vector fdata(size); std::vector sdata(size); generate_random_data_n(fdata.begin(), size, min.x, max.x, gen, max_random_size); generate_random_data_n(sdata.begin(), size, min.y, max.y, gen, max_random_size); for(size_t i = 0; i < size; i++) { it[i] = T(fdata[i], sdata[i]); } return it + size; } template inline auto generate_random_data_n(OutputIterator it, size_t size, it_value_t min, it_value_t max, Generator& gen, size_t max_random_size = 1024 * 1024) -> std::enable_if_t>::value && !std::is_same::value, OutputIterator> { using T = it_value_t; using field_type = decltype(max.x); std::vector field_data(size); generate_random_data_n(field_data.begin(), size, min.x, max.x, gen, max_random_size); for(size_t i = 0; i < size; i++) { it[i] = T(field_data[i]); } return it + size; } template inline std::vector get_random_data( size_t size, U min, V max, unsigned int seed, size_t max_random_size = 1024 * 1024) { std::vector data(size); engine_type gen(seed); generate_random_data_n(data.begin(), size, min, max, gen, max_random_size); return data; } template auto limit_cast(U value) -> T { static_assert(rocprim::is_arithmetic::value && rocprim::is_arithmetic::value && is_comparable::value, "Cannot use limit_cast with chosen types of T and U"); using common_type = typename std::common_type::type; if(rocprim::is_unsigned::value) { if(value < 0) { return 
rocprim::numeric_limits::min(); } if(static_cast(value) > static_cast(rocprim::numeric_limits::max())) { return rocprim::numeric_limits::max(); } } else if(rocprim::is_signed::value && rocprim::is_unsigned::value) { if(value > rocprim::numeric_limits::max()) { return rocprim::numeric_limits::max(); } } else if(rocprim::is_floating_point::value) { return static_cast(value); } else // Both T and U are signed { if(value < static_cast(rocprim::numeric_limits::min())) { return rocprim::numeric_limits::min(); } else if(value > static_cast(rocprim::numeric_limits::max())) { return rocprim::numeric_limits::max(); } } return static_cast(value); } // This overload below is selected for non-standard float types, e.g. half, which cannot be compared with the limit types. template inline auto limit_random_range(U range_start, V range_end) -> std::enable_if_t::value && (!is_comparable::value || !is_comparable::value), std::pair> { return {static_cast(range_start), static_cast(range_end)}; } template auto limit_random_range(U range_start, V range_end) -> std::enable_if_t<(is_custom_type::value && is_comparable::value && is_comparable::value && is_comparable::value && is_comparable::value && rocprim::is_arithmetic::value && rocprim::is_arithmetic::value && rocprim::is_arithmetic::value && rocprim::is_arithmetic::value), std::pair> { return { T{limit_cast(range_start), limit_cast(range_start)}, T{ limit_cast(range_end), limit_cast(range_end) } }; } template inline auto limit_random_range(U range_start, V range_end) -> std::enable_if_t::value && is_comparable::value && is_comparable::value, std::pair> { if(is_comparable::value) { using common_type = typename std::common_type::type; if(static_cast(range_start) > static_cast(range_end)) { throw std::range_error("limit_random_range: Incorrect range used!"); } } T start = limit_cast(range_start); T end = limit_cast(range_end); return std::make_pair(start, end); } inline bool is_warp_size_supported(const unsigned int required_warp_size, 
const int device_id) { unsigned int warp_size; HIP_CHECK(::rocprim::host_warp_size(device_id, warp_size)); return warp_size >= required_warp_size; } template __device__ constexpr bool device_test_enabled_for_warp_size_v = ::rocprim::arch::wavefront::min_size() >= LogicalWarpSize; /// \brief Get segments of uniform random size in [1, max_segment_length] with random key. template std::vector get_random_segments(const size_t size, const size_t max_segment_length, unsigned int seed) { static_assert(rocprim::is_arithmetic::value, "Key type must be arithmetic"); engine_type prng(seed); std::uniform_int_distribution segment_length_distribution( std::numeric_limits::min(), max_segment_length); // std::uniform_real_distribution cannot handle rocprim::half, use float instead using dis_type = typename std::conditional::value, float, T>::type; using key_distribution_type = std::conditional_t::value, std::uniform_int_distribution, std::uniform_real_distribution>; key_distribution_type key_distribution(rocprim::numeric_limits::max()); std::vector keys(size); size_t keys_start_index = 0; while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); const T key = key_distribution(prng); std::fill(keys.begin() + keys_start_index, keys.begin() + new_segment_end, key); keys_start_index += new_segment_length; } return keys; } /// \brief Get segments of uniform random size in [1, max_segment_length] with unique incrementing key. 
template std::vector get_random_segments_iota(const size_t size, const size_t max_segment_length, unsigned int seed) { engine_type prng(seed); std::uniform_int_distribution segment_length_distribution(1, max_segment_length); std::vector keys(size); size_t segment_index = 0; size_t keys_start_index = 0; while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); const T key = segment_index++; std::fill(keys.begin() + keys_start_index, keys.begin() + new_segment_end, key); keys_start_index += new_segment_length; } return keys; } template inline auto get_random_value(U min, V max, size_t seed_value) -> std::enable_if_t::value, T> { T result; engine_type gen(seed_value); generate_random_data_n(&result, 1, min, max, gen); return result; } template inline auto get_random_value(T min, T max, size_t seed_value) -> std::enable_if_t::value, T> { typename T::first_type result_first; typename T::second_type result_second; engine_type gen(seed_value); generate_random_data_n(&result_first, 1, min.x, max.x, gen); generate_random_data_n(&result_second, 1, min.y, max.y, gen); return T{result_first, result_second}; } template struct make_index_range_impl; template struct make_index_range_impl> { using type = std::integer_sequence; }; // make a std::integer_sequence with values from Start to End inclusive template using make_index_range = typename make_index_range_impl>::type; template class Function, T... I, typename... Args> void static_for_each_impl(std::integer_sequence, Args&&... args) { int a[] = {(Function{}(std::forward(args)...), 0)...}; static_cast(a); } // call the supplied template with all values of the std::integer_sequence Indices template class Function, typename... Args> void static_for_each(Args&&... 
args) { static_for_each_impl(Indices{}, std::forward(args)...); } #define REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance) \ benchmark::internal::Benchmark* benchmark = benchmark::RegisterBenchmark( \ instance.name().c_str(), \ [instance](benchmark::State& state, \ size_t _size, \ const managed_seed& _seed, \ hipStream_t _stream) { instance.run(state, _size, _seed, _stream); }, \ size, \ seed, \ stream); \ benchmarks.emplace_back(benchmark) struct config_autotune_interface { virtual std::string name() const = 0; virtual std::string sort_key() const { return name(); }; virtual ~config_autotune_interface() = default; virtual void run(benchmark::State&, size_t, const managed_seed&, hipStream_t) const = 0; }; struct config_autotune_register { static std::vector>& vector() { static std::vector> storage; return storage; } template static config_autotune_register create() { vector().push_back(std::make_unique()); return config_autotune_register(); } template static config_autotune_register create_bulk(BulkCreateFunction&& f) { std::forward(f)(vector()); return config_autotune_register(); } // Register a subset of all created benchmarks for the current parallel instance and add to vector. static void register_benchmark_subset(std::vector& benchmarks, int parallel_instance_index, int parallel_instance_count, size_t size, const managed_seed& seed, hipStream_t stream) { std::vector>& configs = vector(); // sorting to get a consistent order because order of initialization of static variables is undefined by the C++ standard. 
std::sort(configs.begin(), configs.end(), [](const auto& l, const auto& r) { return l->sort_key() < r->sort_key(); }); size_t configs_per_instance = (configs.size() + parallel_instance_count - 1) / parallel_instance_count; size_t start = std::min(parallel_instance_index * configs_per_instance, configs.size()); size_t end = std::min((parallel_instance_index + 1) * configs_per_instance, configs.size()); for(size_t i = start; i < end; i++) { std::unique_ptr& uniq_ptr = configs.at(i); config_autotune_interface* tuning_benchmark = uniq_ptr.get(); benchmark::internal::Benchmark* benchmark = benchmark::RegisterBenchmark( tuning_benchmark->name().c_str(), [tuning_benchmark](benchmark::State& state, size_t size, const managed_seed& seed, hipStream_t stream) { tuning_benchmark->run(state, size, seed, stream); }, size, seed, stream); benchmarks.emplace_back(benchmark); } } }; // Inserts spaces at beginning of string if string shorter than specified length. inline std::string pad_string(std::string str, const size_t len) { if(len > str.size()) { str.insert(str.begin(), len - str.size(), ' '); } return str; } struct bench_naming { public: enum format { json, human, txt }; static format& get_format() { static format storage = human; return storage; } static void set_format(const std::string& argument) { format result = human; if(argument == "json") { result = json; } else if(argument == "txt") { result = txt; } get_format() = result; } private: static std::string matches_as_json(std::sregex_iterator& matches) { std::stringstream result; int brackets_count = 1; result << "{"; bool insert_comma = false; for(std::sregex_iterator i = matches; i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(insert_comma) { result << ","; } else { insert_comma = true; } result << "\"" << m[1].str() << "\":"; if(m[2].length() > 0) { if(m[2].str().find_first_not_of("0123456789") == std::string::npos) { result << m[2].str(); } else { result << "\"" << m[2].str() << "\""; } if(m[3].length() > 0 
&& brackets_count > 0) { int n = std::min(brackets_count, static_cast(m[3].length())); brackets_count -= n; for(int c = 0; c < n; c++) { result << "}"; } } } else { brackets_count++; result << "{"; insert_comma = false; } } while(brackets_count > 0) { brackets_count--; result << "}"; } return result.str(); } static std::string matches_as_human(std::sregex_iterator& matches) { std::stringstream result; int brackets_count = 0; bool insert_comma = false; for(std::sregex_iterator i = matches; i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(insert_comma) { result << ","; } else { insert_comma = true; } if(m[2].length() > 0) { result << m[2].str(); if(m[3].length() > 0 && brackets_count > 0) { int n = std::min(brackets_count, static_cast(m[3].length())); brackets_count -= n; for(int c = 0; c < n; c++) { result << ">"; } } } else { brackets_count++; result << "<"; insert_comma = false; } } while(brackets_count > 0) { brackets_count--; result << ">"; } return result.str(); } public: static std::string format_name(std::string string) { format format = get_format(); std::regex r("([A-z0-9]*):\\s*((?:custom_type<[A-z0-9,]*>)|[A-z:\\(\\)\\.<>\\s0-9]*)(\\}*)"); // First we perform some checks bool checks[4] = {false}; for(std::sregex_iterator i = std::sregex_iterator(string.begin(), string.end(), r); i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(m[1].str() == "lvl") { checks[0] = true; } else if(m[1].str() == "algo") { checks[1] = true; } else if(m[1].str() == "cfg") { checks[2] = true; } } std::string string_substitute = std::regex_replace(string, r, ""); checks[3] = string_substitute.find_first_not_of(" ,{}") == std::string::npos; for(bool check_name_format : checks) { if(!check_name_format) { std::cout << "Benchmark name \"" << string << "\" not in the correct format (e.g. 
" "{lvl:block,algo:reduce,cfg:default_config} )" << std::endl; exit(1); } } // Now we generate the desired format std::sregex_iterator matches = std::sregex_iterator(string.begin(), string.end(), r); switch(format) { case format::json: return matches_as_json(matches); case format::human: return matches_as_human(matches); case format::txt: return string; } return string; } }; template struct Traits { //static inline method instead of static inline attribute because that's only supported from C++17 onwards static inline const char* name(){ static_assert(sizeof(T) == 0, "Traits::name() unknown"); return "unknown"; } }; // Explicit definitions template<> inline const char* Traits::name() { return "char"; } template <> inline const char* Traits::name() { return "int"; } template <> inline const char* Traits::name() { return "short"; } template <> inline const char* Traits::name() { return "int8_t"; } template <> inline const char* Traits::name() { return "uint8_t"; } template<> inline const char* Traits::name() { return "uint16_t"; } template<> inline const char* Traits::name() { return "uint32_t"; } template<> inline const char* Traits::name() { return "rocprim::half"; } template<> inline const char* Traits::name() { return "rocprim::bfloat16"; } template<> inline const char* Traits::name() { return "int64_t"; } // On MSVC `int64_t` and `long long` are the same, leading to multiple definition errors #ifndef _WIN32 template <> inline const char* Traits::name() { return "int64_t"; } #endif template <> inline const char* Traits::name() { return "float"; } template <> inline const char* Traits::name() { return "double"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; 
} template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits::name() { return "empty_type"; } template<> inline const char* Traits>::name() { return "float2"; } template<> inline const char* Traits>::name() { return "double2"; } inline void add_common_benchmark_info() { hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); auto str = [](const std::string& name, const std::string& val) { benchmark::AddCustomContext(name, val); }; auto num = [](const std::string& name, const auto& value) { benchmark::AddCustomContext(name, std::to_string(value)); }; auto dim2 = [num](const std::string& name, const auto* values) { num(name + "_x", values[0]); num(name + "_y", values[1]); }; auto dim3 = [num, dim2](const std::string& name, const auto* values) { dim2(name, values); num(name + "_z", values[2]); }; str("hdp_name", devProp.name); num("hdp_total_global_mem", devProp.totalGlobalMem); num("hdp_shared_mem_per_block", devProp.sharedMemPerBlock); num("hdp_regs_per_block", devProp.regsPerBlock); num("hdp_warp_size", devProp.warpSize); num("hdp_max_threads_per_block", devProp.maxThreadsPerBlock); dim3("hdp_max_threads_dim", devProp.maxThreadsDim); dim3("hdp_max_grid_size", devProp.maxGridSize); num("hdp_clock_rate", devProp.clockRate); num("hdp_memory_clock_rate", devProp.memoryClockRate); num("hdp_memory_bus_width", devProp.memoryBusWidth); num("hdp_total_const_mem", devProp.totalConstMem); num("hdp_major", devProp.major); num("hdp_minor", devProp.minor); num("hdp_multi_processor_count", devProp.multiProcessorCount); num("hdp_l2_cache_size", devProp.l2CacheSize); num("hdp_max_threads_per_multiprocessor", 
devProp.maxThreadsPerMultiProcessor); num("hdp_compute_mode", devProp.computeMode); num("hdp_clock_instruction_rate", devProp.clockInstructionRate); num("hdp_concurrent_kernels", devProp.concurrentKernels); num("hdp_pci_domain_id", devProp.pciDomainID); num("hdp_pci_bus_id", devProp.pciBusID); num("hdp_pci_device_id", devProp.pciDeviceID); num("hdp_max_shared_memory_per_multi_processor", devProp.maxSharedMemoryPerMultiProcessor); num("hdp_is_multi_gpu_board", devProp.isMultiGpuBoard); num("hdp_can_map_host_memory", devProp.canMapHostMemory); str("hdp_gcn_arch_name", devProp.gcnArchName); num("hdp_integrated", devProp.integrated); num("hdp_cooperative_launch", devProp.cooperativeLaunch); num("hdp_cooperative_multi_device_launch", devProp.cooperativeMultiDeviceLaunch); num("hdp_max_texture_1d_linear", devProp.maxTexture1DLinear); num("hdp_max_texture_1d", devProp.maxTexture1D); dim2("hdp_max_texture_2d", devProp.maxTexture2D); dim3("hdp_max_texture_3d", devProp.maxTexture3D); num("hdp_mem_pitch", devProp.memPitch); num("hdp_texture_alignment", devProp.textureAlignment); num("hdp_texture_pitch_alignment", devProp.texturePitchAlignment); num("hdp_kernel_exec_timeout_enabled", devProp.kernelExecTimeoutEnabled); num("hdp_ecc_enabled", devProp.ECCEnabled); num("hdp_tcc_driver", devProp.tccDriver); num("hdp_cooperative_multi_device_unmatched_func", devProp.cooperativeMultiDeviceUnmatchedFunc); num("hdp_cooperative_multi_device_unmatched_grid_dim", devProp.cooperativeMultiDeviceUnmatchedGridDim); num("hdp_cooperative_multi_device_unmatched_block_dim", devProp.cooperativeMultiDeviceUnmatchedBlockDim); num("hdp_cooperative_multi_device_unmatched_shared_mem", devProp.cooperativeMultiDeviceUnmatchedSharedMem); num("hdp_is_large_bar", devProp.isLargeBar); num("hdp_asic_revision", devProp.asicRevision); num("hdp_managed_memory", devProp.managedMemory); num("hdp_direct_managed_mem_access_from_host", devProp.directManagedMemAccessFromHost); num("hdp_concurrent_managed_access", 
devProp.concurrentManagedAccess); num("hdp_pageable_memory_access", devProp.pageableMemoryAccess); num("hdp_pageable_memory_access_uses_host_page_tables", devProp.pageableMemoryAccessUsesHostPageTables); const auto arch = devProp.arch; num("hdp_arch_has_global_int32_atomics", arch.hasGlobalInt32Atomics); num("hdp_arch_has_global_float_atomic_exch", arch.hasGlobalFloatAtomicExch); num("hdp_arch_has_shared_int32_atomics", arch.hasSharedInt32Atomics); num("hdp_arch_has_shared_float_atomic_exch", arch.hasSharedFloatAtomicExch); num("hdp_arch_has_float_atomic_add", arch.hasFloatAtomicAdd); num("hdp_arch_has_global_int64_atomics", arch.hasGlobalInt64Atomics); num("hdp_arch_has_shared_int64_atomics", arch.hasSharedInt64Atomics); num("hdp_arch_has_doubles", arch.hasDoubles); num("hdp_arch_has_warp_vote", arch.hasWarpVote); num("hdp_arch_has_warp_ballot", arch.hasWarpBallot); num("hdp_arch_has_warp_shuffle", arch.hasWarpShuffle); num("hdp_arch_has_funnel_shift", arch.hasFunnelShift); num("hdp_arch_has_thread_fence_system", arch.hasThreadFenceSystem); num("hdp_arch_has_sync_threads_ext", arch.hasSyncThreadsExt); num("hdp_arch_has_surface_funcs", arch.hasSurfaceFuncs); num("hdp_arch_has_3d_grid", arch.has3dGrid); num("hdp_arch_has_dynamic_parallelism", arch.hasDynamicParallelism); } inline const char* get_block_scan_algorithm_name(rocprim::block_scan_algorithm alg) { switch(alg) { case rocprim::block_scan_algorithm::using_warp_scan: return "block_scan_algorithm::using_warp_scan"; case rocprim::block_scan_algorithm::reduce_then_scan: return "block_scan_algorithm::reduce_then_scan"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "default_algorithm"; } inline const char* get_block_load_method_name(rocprim::block_load_method method) { switch(method) { case rocprim::block_load_method::block_load_direct: return "block_load_method::block_load_direct"; case rocprim::block_load_method::block_load_striped: return "block_load_method::block_load_striped"; 
case rocprim::block_load_method::block_load_vectorize: return "block_load_method::block_load_vectorize"; case rocprim::block_load_method::block_load_transpose: return "block_load_method::block_load_transpose"; case rocprim::block_load_method::block_load_warp_transpose: return "block_load_method::block_load_warp_transpose"; } return "default_method"; } template struct alignas(Alignment) custom_aligned_type { unsigned char data[Size]; }; template std::string partition_config_name() { const rocprim::detail::partition_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; } template<> inline std::string partition_config_name() { return "default_config"; } #endif // ROCPRIM_BENCHMARK_UTILS_HPP_ rocPRIM-rocm-6.4.3/benchmark/benchmark_warp_exchange.cpp000066400000000000000000000361071502235215600232300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif struct BlockedToStripedOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.blocked_to_striped(items, items, storage); } }; struct StripedToBlockedOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.striped_to_blocked(items, items, storage); } }; struct BlockedToStripedShuffleOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) const { warp_exchange.blocked_to_striped_shuffle(items, items); } }; struct StripedToBlockedShuffleOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) const { warp_exchange.striped_to_blocked_shuffle(items, items); } }; struct ScatterToStripedOp { template< class T, class OffsetT, class 
warp_exchange_type, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&thread_data)[ItemsPerThread], const OffsetT (&ranks)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.scatter_to_striped(thread_data, thread_data, ranks, storage); } }; template __device__ auto warp_exchange_benchmark(T* d_output, unsigned int trials) -> std::enable_if_t && !std::is_same::value> { T thread_data[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { // generate unique value each data-element thread_data[i] = static_cast(threadIdx.x * ItemsPerThread + i); } using warp_exchange_type = ::rocprim::warp_exchange; constexpr unsigned int warps_in_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; ROCPRIM_SHARED_MEMORY typename warp_exchange_type::storage_type storage[warps_in_block]; ROCPRIM_NO_UNROLL for(unsigned int i = 0; i < trials; i++) { Op{}(warp_exchange_type(), thread_data, storage[warp_id]); ::rocprim::wave_barrier(); } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { const unsigned int global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template __device__ auto warp_exchange_benchmark(T* d_output, unsigned int trials) -> std::enable_if_t && std::is_same::value> { T thread_data[ItemsPerThread]; unsigned int thread_ranks[ItemsPerThread]; constexpr unsigned int warps_in_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; const unsigned int lane_id = threadIdx.x % LogicalWarpSize; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { // generate unique value each data-element thread_data[i] = static_cast(threadIdx.x * ItemsPerThread + i); // generate unique destination location for each data-element const unsigned int s_lane_id = i % 2 == 0 ? 
LogicalWarpSize - 1 - lane_id : lane_id; thread_ranks[i] = s_lane_id*ItemsPerThread+i; // scatter values in warp across whole storage } using warp_exchange_type = ::rocprim::warp_exchange; ROCPRIM_SHARED_MEMORY typename warp_exchange_type::storage_type storage[warps_in_block]; ROCPRIM_NO_UNROLL for(unsigned int i = 0; i < trials; i++) { Op{}(warp_exchange_type(), thread_data, thread_ranks, storage[warp_id]); ::rocprim::wave_barrier(); } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { const unsigned int global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template __device__ auto warp_exchange_benchmark(T* /*d_output*/, unsigned int /*trials*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T* d_output, unsigned int trials) { warp_exchange_benchmark(d_output, trials); } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int LogicalWarpSize, class Op > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes) { // Calculate the number of elements size_t N = bytes / sizeof(T); constexpr unsigned int trials = 200; constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int size = items_per_block * ((N + items_per_block - 1) / items_per_block); T * d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); warp_exchange_kernel <<>>(d_output, trials); HIP_CHECK(hipPeekAtLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP 
events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * trials * size); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IT, WS, OP) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:warp,algo:exchange,key_type:" #T \ ",operation:" #OP ",ws:" #WS \ ",cfg:{bs:" #BS ",ipt:" #IT "}}") \ .c_str(), \ &run_benchmark, \ stream, \ bytes) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, 256, 1, 16, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 1, 32, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 4, 16, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 4, 32, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 16, 16, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 16, 32, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 32, 32, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 1, 16, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 1, 32, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 4, 16, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 4, 32, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 16, 16, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 16, 32, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 32, 32, StripedToBlockedOp), CREATE_BENCHMARK(int, 
256, 1, 16, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 32, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 16, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 32, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 16, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 32, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 32, 32, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 16, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 32, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 16, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 32, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 16, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 32, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 32, 32, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 16, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 1, 32, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 4, 16, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 4, 32, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 16, 16, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 16, 32, ScatterToStripedOp)}; int hip_device = 0; HIP_CHECK(::rocprim::detail::get_device_from_stream(stream, hip_device)); if(is_warp_size_supported(64, hip_device)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 1, 64, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 4, 64, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 16, 64, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 64, 64, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 1, 64, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 4, 64, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 16, 64, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 64, 64, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 1, 64, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 64, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 64, BlockedToStripedShuffleOp), 
CREATE_BENCHMARK(int, 256, 64, 64, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 64, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 64, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 64, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 64, 64, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 64, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 4, 64, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 16, 64, ScatterToStripedOp)}; benchmarks.insert( benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end() ); } // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for (auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_warp_reduce.cpp000066400000000000000000000242501502235215600227110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif template< bool AllReduce, class T, unsigned int WarpSize, unsigned int Trials > __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_reduce_kernel(const T * d_input, T * d_output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().reduce(value, value, storage); } d_output[i] = value; } template< class T, class Flag, unsigned int WarpSize, unsigned int Trials > __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; auto flag = d_flags[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().head_segmented_reduce(value, value, flag, storage); } d_output[i] = value; } template< bool AllReduce, bool Segmented, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials, class T, class Flag > inline auto execute_warp_reduce_kernel(T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_reduce_kernel), 
dim3(size/BlockSize), dim3(BlockSize), 0, stream, input, output ); HIP_CHECK(hipGetLastError()); } template< bool AllReduce, bool Segmented, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials, class T, class Flag > inline auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL( HIP_KERNEL_NAME(segmented_warp_reduce_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, input, flags, output ); HIP_CHECK(hipGetLastError()); } template void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) { using flag_type = unsigned char; // Calculate the number of elements size_t N = bytes / sizeof(T); const auto size = BlockSize * ((N + BlockSize - 1)/BlockSize); const auto random_range = limit_random_range(0, 10); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); std::vector flags = get_random_data(size, 0, 1, seed.get_1()); T * d_input; flag_type * d_flags; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), size * sizeof(flag_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); execute_warp_reduce_kernel(d_input, d_output, d_flags, size, stream); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, 
stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_flags)); } #define CREATE_BENCHMARK(T, WS, BS) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:warp,algo:reduce,key_type:" #T ",broadcast_result:" \ + std::string(AllReduce ? "true" : "false") \ + ",segmented:" + std::string(Segmented ? "true" : "false") \ + ",ws:" #WS ",cfg:{bs:" #BS "}}") \ .c_str(), \ run_benchmark, \ bytes, \ seed, \ stream) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 32, 64), \ CREATE_BENCHMARK(type, 37, 64), \ CREATE_BENCHMARK(type, 61, 64), \ CREATE_BENCHMARK(type, 64, 64) template void add_benchmarks(std::vector& benchmarks, size_t bytes, const managed_seed& seed, hipStream_t stream) { std::vector bs = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half) }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); 
benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, bytes, seed, stream); add_benchmarks(benchmarks, bytes, seed, stream); add_benchmarks(benchmarks, bytes, seed, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_warp_scan.cpp000066400000000000000000000204631502235215600223700ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 * 4; #endif namespace rp = rocprim; template __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_inclusive_scan_kernel(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().inclusive_scan(value, value, storage); } output[i] = value; } template __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_exclusive_scan_kernel(const T* input, T* output, const T init) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().exclusive_scan(value, value, init, storage); } output[i] = value; } template< class T, unsigned int BlockSize, unsigned int WarpSize, bool Inclusive = true, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t bytes) { // Calculate the number of elements size_t size = bytes / sizeof(T); // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1)/BlockSize); // Allocate and fill memory std::vector input(size, (T)1); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); 
HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); if(Inclusive) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_inclusive_scan_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, d_input, d_output ); } else { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_exclusive_scan_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, d_input, d_output, input[0] ); } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, WS, INCLUSIVE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:warp,algo:scan,key_type:" #T ",subalgo:" \ + std::string(Inclusive ? 
"inclusive" : "exclusive") \ + ",ws:" #WS ",cfg:{bs:" #BS "}}") \ .c_str(), \ run_benchmark, \ stream, \ bytes) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 64, 64, Inclusive), \ CREATE_BENCHMARK(type, 128, 64, Inclusive), \ CREATE_BENCHMARK(type, 256, 64, Inclusive), \ CREATE_BENCHMARK(type, 256, 32, Inclusive), \ CREATE_BENCHMARK(type, 256, 16, Inclusive), \ CREATE_BENCHMARK(type, 63, 63, Inclusive), \ CREATE_BENCHMARK(type, 62, 31, Inclusive), \ CREATE_BENCHMARK(type, 60, 15, Inclusive) template void add_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t bytes) { using custom_double2 = custom_type; using custom_int_double = custom_type; std::vector new_benchmarks = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_TYPE(custom_double2), BENCHMARK_TYPE(custom_int_double) }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, stream, bytes); //inclusive add_benchmarks(benchmarks, stream, bytes); //exclusive // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { 
b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-6.4.3/benchmark/benchmark_warp_sort.cpp000066400000000000000000000306221502235215600224310ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #ifndef DEFAULT_BYTES const size_t DEFAULT_BYTES = 1024 * 1024 * 32 *4; #endif namespace rp = rocprim; template __global__ __launch_bounds__(BlockSize) void warp_sort_kernel(K* input_keys, K* output_keys) { const unsigned int flat_tid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int block_offset = blockIdx.x * items_per_block; K keys[ItemsPerThread]; rp::block_load_direct_striped(flat_tid, input_keys + block_offset, keys); rp::warp_sort wsort; wsort.sort(keys); rp::block_store_direct_blocked(flat_tid, output_keys + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void warp_sort_by_key_kernel(K* input_keys, V* input_values, K* output_keys, V* output_values) { const unsigned int flat_tid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int block_offset = blockIdx.x * items_per_block; K keys[ItemsPerThread]; V values[ItemsPerThread]; rp::block_load_direct_striped(flat_tid, input_keys + block_offset, keys); rp::block_load_direct_striped(flat_tid, input_values + block_offset, values); rp::warp_sort wsort; wsort.sort(keys, values); rp::block_store_direct_blocked(flat_tid, output_keys + block_offset, keys); rp::block_store_direct_blocked(flat_tid, output_values + block_offset, values); } template void run_benchmark(benchmark::State& state, size_t bytes, const managed_seed& seed, hipStream_t stream) { // Calculate the number of elements size_t size = bytes / sizeof(Key); // Make sure size is a multiple of items_per_block constexpr auto items_per_block = BlockSize * ItemsPerThread; size = BlockSize * ((size + items_per_block - 1) / items_per_block); // Allocate and fill memory const auto random_range = limit_random_range(0, 10'000); std::vector 
input_key = get_random_data(size, random_range.first, random_range.second, seed.get_0()); std::vector input_value(size_t(1)); if(SortByKey) { const auto random_range = limit_random_range(0, 10'000); input_value = get_random_data(size, random_range.first, random_range.second, seed.get_1()); } Key * d_input_key = nullptr; Key * d_output_key = nullptr; Value * d_input_value = nullptr; Value * d_output_value = nullptr; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_key), size * sizeof(Key))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output_key), size * sizeof(Key))); if(SortByKey) { HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_value), size * sizeof(Value))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output_value), size * sizeof(Value))); } HIP_CHECK( hipMemcpy( d_input_key, input_key.data(), size * sizeof(Key), hipMemcpyHostToDevice ) ); if(SortByKey) HIP_CHECK( hipMemcpy( d_input_value, input_value.data(), size * sizeof(Value), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); if(SortByKey) { ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_sort_by_key_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input_key, d_input_value, d_output_key, d_output_value ); } } else { ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_sort_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input_key, d_output_key ); } } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // 
Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); // SortByKey also transfers values auto sorted_type_size = sizeof(Key); if(SortByKey) sorted_type_size += sizeof(Value); state.SetBytesProcessed(state.iterations() * size * sorted_type_size * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input_key)); HIP_CHECK(hipFree(d_output_key)); HIP_CHECK(hipFree(d_input_value)); HIP_CHECK(hipFree(d_output_value)); } #define CREATE_SORT_BENCHMARK(K, BS, WS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:warp,algo:sort,key_type:" #K ",value_type:" \ + std::string(Traits::name()) \ + ",ws:" #WS ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ bytes, \ seed, \ stream) #define CREATE_SORTBYKEY_BENCHMARK(K, V, BS, WS, IPT) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:warp,algo:sort,key_type:" #K \ ",value_type:" #V ",ws:" #WS \ ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ bytes, \ seed, \ stream) #define BENCHMARK_TYPE(type) \ CREATE_SORT_BENCHMARK(type, 64, 64, 1), \ CREATE_SORT_BENCHMARK(type, 64, 64, 2), \ CREATE_SORT_BENCHMARK(type, 64, 64, 4), \ CREATE_SORT_BENCHMARK(type, 128, 64, 1), \ CREATE_SORT_BENCHMARK(type, 128, 64, 2), \ CREATE_SORT_BENCHMARK(type, 128, 64, 4), \ CREATE_SORT_BENCHMARK(type, 256, 64, 1), \ CREATE_SORT_BENCHMARK(type, 256, 64, 2), \ CREATE_SORT_BENCHMARK(type, 256, 64, 4), \ CREATE_SORT_BENCHMARK(type, 64, 32, 1), \ CREATE_SORT_BENCHMARK(type, 64, 32, 2), \ CREATE_SORT_BENCHMARK(type, 64, 16, 1), \ CREATE_SORT_BENCHMARK(type, 64, 16, 2), \ CREATE_SORT_BENCHMARK(type, 64, 16, 4) #define BENCHMARK_KEY_TYPE(type, value) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 1), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 2), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 4), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 1), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 2), \ 
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 4) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t bytes = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); const std::string seed_type = parser.get("seed"); const managed_seed seed(seed_type); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("bytes", std::to_string(bytes)); benchmark::AddCustomContext("seed", seed_type); using custom_double2 = custom_type; using custom_int_double = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; std::vector benchmarks = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_KEY_TYPE(float, float), BENCHMARK_KEY_TYPE(unsigned int, int), BENCHMARK_KEY_TYPE(int, custom_double2), BENCHMARK_KEY_TYPE(int, custom_int_double), BENCHMARK_KEY_TYPE(custom_int2, custom_double2), BENCHMARK_KEY_TYPE(custom_int2, custom_char_double), BENCHMARK_KEY_TYPE(custom_int2, custom_longlong_double), BENCHMARK_KEY_TYPE(int8_t, int8_t), BENCHMARK_KEY_TYPE(uint8_t, uint8_t), BENCHMARK_KEY_TYPE(rocprim::half, rocprim::half) }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } 
rocPRIM-rocm-6.4.3/benchmark/cmdparser.hpp000066400000000000000000000420621502235215600203650ustar00rootroot00000000000000// The MIT License (MIT) // // Copyright (c) 2015 - 2016 Florian Rappl // Modifications Copyright (c) 2019-2024, Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. /* This file is part of the C++ CmdParser utility. Copyright (c) 2015 - 2016 Florian Rappl */ #pragma once #include #include #include #include #include #include namespace cli { struct CallbackArgs { const std::vector& arguments; std::ostream& output; std::ostream& error; }; class Parser { private: class CmdBase { public: explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) : name(name), command(name.size() > 0 ? "-" + name : ""), alternative(alternative.size() > 0 ? 
"--" + alternative : ""), description(description), required(required), handled(false), arguments({}), dominant(dominant), variadic(variadic) { } virtual ~CmdBase() { } std::string name; std::string command; std::string alternative; std::string description; bool required; bool handled; std::vector arguments; bool const dominant; bool const variadic; virtual std::string print_value() const = 0; virtual bool parse(std::ostream& output, std::ostream& error) = 0; bool is(const std::string& given) const { return given == command || given == alternative; } }; template struct ArgumentCountChecker { static constexpr bool Variadic = false; }; template struct ArgumentCountChecker> { static constexpr bool Variadic = true; }; template class CmdFunction final : public CmdBase { public: explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { } virtual bool parse(std::ostream& output, std::ostream& error) override { try { CallbackArgs args { arguments, output, error }; value = callback(args); return true; } catch (...) { return false; } } virtual std::string print_value() const override { return ""; } std::function callback; T value; }; template class CmdArgument final : public CmdBase { public: explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic), value(T()) { } virtual bool parse(std::ostream&, std::ostream&) override { try { value = Parser::parse(arguments, value); return true; } catch (...) 
{ return false; } } virtual std::string print_value() const override { return stringify(value); } T value; }; static int parse(const std::vector& elements, const int&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoi(elements[0]); } static bool parse(const std::vector& elements, const bool& defval) { if (elements.size() != 0) throw std::runtime_error("A boolean command line parameter cannot have any arguments."); return !defval; } static double parse(const std::vector& elements, const double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stod(elements[0]); } static float parse(const std::vector& elements, const float&) { if (elements.size() != 1) throw std::bad_cast(); return std::stof(elements[0]); } static long double parse(const std::vector& elements, const long double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stold(elements[0]); } static unsigned int parse(const std::vector& elements, const unsigned int&) { if (elements.size() != 1) throw std::bad_cast(); return static_cast(std::stoul(elements[0])); } static unsigned long parse(const std::vector& elements, const unsigned long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoul(elements[0]); } static unsigned long long parse(const std::vector& elements, const unsigned long long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoull(elements[0]); } static long parse(const std::vector& elements, const long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stol(elements[0]); } static std::string parse(const std::vector& elements, const std::string&) { if (elements.size() != 1) throw std::bad_cast(); return elements[0]; } template static std::vector parse(const std::vector& elements, const std::vector&) { const T defval = T(); std::vector values { }; std::vector buffer(1); for (const auto& element : elements) { buffer[0] = element; values.push_back(parse(buffer, defval)); } return values; } template static 
std::string stringify(const T& value) { return std::to_string(value); } template static std::string stringify(const std::vector& values) { std::stringstream ss { }; ss << "[ "; for (const auto& value : values) { ss << stringify(value) << " "; } ss << "]"; return ss.str(); } static std::string stringify(const std::string& str) { return str; } public: explicit Parser(int argc, const char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } explicit Parser(int argc, char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } ~Parser() { for (int i = 0, n = _commands.size(); i < n; ++i) { delete _commands[i]; } } bool has_help() const { for (const auto command : _commands) { if (command->name == "h" && command->alternative == "--help") { return true; } } return false; } void enable_help() { set_callback("h", "help", std::function([this](CallbackArgs& args){ args.output << this->usage(); /*exit(0);*/ return false; }), "", true); } void disable_help() { for (auto command = _commands.begin(); command != _commands.end(); ++command) { if ((*command)->name == "h" && (*command)->alternative == "--help") { _commands.erase(command); break; } } } template void set_default(bool is_required, const std::string& description = "") { auto command = new CmdArgument { "", "", description, is_required, false }; _commands.push_back(command); } template void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, true, dominant }; _commands.push_back(command); } template void set_optional(const std::string& name, const std::string& alternative, const T& defaultValue, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, false, dominant }; command->value = 
defaultValue; _commands.push_back(command); } template void set_callback(const std::string& name, const std::string& alternative, std::function callback, const std::string& description = "", bool dominant = false) { auto command = new CmdFunction { name, alternative, description, false, dominant }; command->callback = callback; _commands.push_back(command); } inline void run_and_exit_if_error() { if (run() == false) { exit(1); } } inline bool run() { return run(std::cout, std::cerr); } inline bool run(std::ostream& output) { return run(output, std::cerr); } bool run(std::ostream& output, std::ostream& error) { if (_arguments.size() > 0) { auto current = find_default(); for (int i = 0, n = _arguments.size(); i < n; ++i) { auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; auto associated = isarg ? find(_arguments[i]) : nullptr; if (associated != nullptr) { current = associated; associated->handled = true; } else if (current == nullptr) { current = find(_arguments[i]); // Code was commented out so cmdparser can ignore unknown options // error << no_default(); // return false; } else { current->arguments.push_back(_arguments[i]); current->handled = true; if (!current->variadic) { // If the current command is not variadic, then no more arguments // should be added to it. In this case, switch back to the default // command. current = find_default(); } } } } // First, parse dominant arguments since they succeed even if required // arguments are missing. for (auto command : _commands) { if (command->handled && command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } // Next, check for any missing arguments. for (auto command : _commands) { if (command->required && !command->handled) { error << howto_required(command); return false; } } // Finally, parse all remaining arguments. 
for (auto command : _commands) { if (command->handled && !command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } return true; } template T get(const std::string& name) const { for (const auto& command : _commands) { if (command->name == name) { auto cmd = dynamic_cast*>(command); if (cmd == nullptr) { throw std::runtime_error("Invalid usage of the parameter " + name + " detected."); } return cmd->value; } } throw std::runtime_error("The parameter " + name + " could not be found."); } template T get_if(const std::string& name, std::function callback) const { auto value = get(name); return callback(value); } int requirements() const { int count = 0; for (const auto& command : _commands) { if (command->required) { ++count; } } return count; } int commands() const { return static_cast(_commands.size()); } inline const std::string& app_name() const { return _appname; } protected: CmdBase* find(const std::string& name) { for (auto command : _commands) { if (command->is(name)) { return command; } } return nullptr; } CmdBase* find_default() { for (auto command : _commands) { if (command->name == "") { return command; } } return nullptr; } std::string usage() const { std::stringstream ss { }; ss << "Available parameters:\n\n"; for (const auto& command : _commands) { ss << " " << command->command << "\t" << command->alternative; if (command->required == true) { ss << "\t(required)"; } ss << "\n " << command->description; if (command->required == false) { ss << "\n " << "This parameter is optional. 
The default value is '" + command->print_value() << "'."; } ss << "\n\n"; } return ss.str(); } void print_help(std::stringstream& ss) const { if (has_help()) { ss << "For more help use --help or -h.\n"; } } std::string howto_required(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " is required.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string howto_use(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " has invalid arguments.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string no_default() const { std::stringstream ss { }; ss << "No default parameter has been specified.\n"; ss << "The given argument must be used with a parameter.\n"; print_help(ss); return ss.str(); } private: const std::string _appname; std::vector _arguments; std::vector _commands; }; } rocPRIM-rocm-6.4.3/cmake/000077500000000000000000000000001502235215600150165ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/cmake/ConfigAutotune.cmake000066400000000000000000000116321502235215600207550ustar00rootroot00000000000000# MIT License # # Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # Function to add a configured source file to a target. # It parses arguments, prepares the output file name, and configures the file. function(add_configured_source) # Parse arguments and ensure proper usage cmake_parse_arguments(PARSE_ARGV 0 ARG "" "INPUT;TARGET;OUTPUT_PATTERN" "NAMES;VALUES") list(LENGTH ARG_NAMES NAMES_LEN) list(LENGTH ARG_VALUES VALS_LEN) if (NOT NAMES_LEN EQUAL VALS_LEN) message(FATAL_ERROR "add_configured_source: The same number of names (${NAMES_LEN}) and values (${VALS_LEN}) must be provided!") endif() # Loop through the names and values, preparing the output pattern set(max ${VALS_LEN}) math(EXPR max "${max} - 1") foreach(i RANGE ${max}) list(GET ARG_NAMES ${i} curr_name) list(GET ARG_VALUES ${i} "${curr_name}") endforeach() # Configure the output file and add it to the target string(CONFIGURE "${ARG_OUTPUT_PATTERN}" output @ONLY) string(MAKE_C_IDENTIFIER ${output} output) set(output_path "${ARG_TARGET}.parallel/${output}.cpp") configure_file("${ARG_INPUT}" "${output_path}" @ONLY) set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_CLEAN_FILES "${ARG_TARGET}.parallel") target_sources("${ARG_TARGET}" PRIVATE "${output_path}") target_include_directories("${ARG_TARGET}" PRIVATE "../benchmark") # Ensure reconfiguration if necessary set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${ARG_INPUT}" "${output_path}") endfunction() # Function to divide two numbers and round up. 
function(div_round_up dividend divisor result_var) math(EXPR result "(${dividend} + ${divisor} - 1) / ${divisor}") set("${result_var}" "${result}" PARENT_SCOPE) endfunction() # Function to add a matrix of configured sources. # It handles permutations of input parameters and calls add_configured_source accordingly. function(add_matrix) set(single_value_args "TARGET" "INPUT" "OUTPUT_PATTERN" "SHARDS" "CURRENT_SHARD") cmake_parse_arguments(PARSE_ARGV 0 ARG "" "${single_value_args}" "NAMES;LISTS") # Validate argument lengths list(LENGTH ARG_NAMES NAMES_LEN) list(LENGTH ARG_LISTS LISTS_LEN) if (NOT NAMES_LEN EQUAL LISTS_LEN) message(FATAL_ERROR "add_matrix: The same number of names (${NAMES_LEN}) and lists (${LISTS_LEN}) must be provided!") endif() # Calculate the total number of permutations set(total_len 1) foreach(LIST IN LISTS ARG_LISTS) string(REPLACE " " ";" list ${LIST}) list(LENGTH list LIST_LEN) math(EXPR total_len "${total_len} * ${LIST_LEN}") endforeach() # Handle sharding if(NOT DEFINED ARG_SHARDS) set(ARG_SHARDS 1) endif() div_round_up("${total_len}" "${ARG_SHARDS}" per_shard) # Determine the range of permutations for the current shard math(EXPR start "${ARG_CURRENT_SHARD} * ${per_shard}") math(EXPR stop "${start} + ${per_shard} - 1") # Process each permutation foreach(i RANGE ${start} ${stop}) set(index ${i}) set(values "") foreach(input_list IN LISTS ARG_LISTS) string(REPLACE " " ";" curr_list ${input_list}) list(LENGTH curr_list curr_length) math(EXPR curr_index "${index} % ${curr_length}") list(GET curr_list ${curr_index} curr_item) list(APPEND values "${curr_item}") math(EXPR index "${index} / ${curr_length}") endforeach() # Add the configured source for each permutation add_configured_source(TARGET "${ARG_TARGET}" INPUT "${ARG_INPUT}" OUTPUT_PATTERN "${ARG_OUTPUT_PATTERN}" NAMES ${ARG_NAMES} VALUES ${values}) endforeach() endfunction() # Function to filter out odd block sizes. # It sets a variable in the parent scope based on the condition. 
function(reject_odd_blocksize RESULT BlockSize) math(EXPR res "${BlockSize} % 2") if(res EQUAL 0) set("${RESULT}" ON PARENT_SCOPE) else() set("${RESULT}" OFF PARENT_SCOPE) endif() endfunction()rocPRIM-rocm-6.4.3/cmake/Dependencies.cmake000066400000000000000000000175601502235215600204170ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ########################### # rocPRIM dependencies # ########################### # NOTE1: the reason we don't scope global state meddling using add_subdirectory # is because CMake < 3.24 lacks CMAKE_FIND_PACKAGE_TARGETS_GLOBAL which # would promote IMPORTED targets of find_package(CONFIG) to be visible # by other parts of the build. So we save and restore global state. # # NOTE2: We disable the ROCMChecks.cmake warning noting that we meddle with # global state. 
This is consequence of abusing the CMake CXX language # which HIP piggybacks on top of. This kind of HIP support has one chance # at observing the global flags, at the find_package(HIP) invocation. # The device compiler won't be able to pick up changes after that, hence # the warning. set(USER_CXX_FLAGS ${CMAKE_CXX_FLAGS}) if(DEFINED BUILD_SHARED_LIBS) set(USER_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) endif() set(USER_ROCM_WARN_TOOLCHAIN_VAR ${ROCM_WARN_TOOLCHAIN_VAR}) set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "") # Turn off warnings and errors for all warnings in dependencies separate_arguments(CXX_FLAGS_LIST NATIVE_COMMAND ${CMAKE_CXX_FLAGS}) list(REMOVE_ITEM CXX_FLAGS_LIST /WX -Werror -Werror=pendantic -pedantic-errors) if(MSVC) list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "/[Ww]([0-4]?)(all)?") # Remove MSVC warning flags list(APPEND CXX_FLAGS_LIST /w) else() list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "-W(all|extra|everything)") # Remove GCC/LLVM flags list(APPEND CXX_FLAGS_LIST -w) endif() list(JOIN CXX_FLAGS_LIST " " CMAKE_CXX_FLAGS) # Don't build client dependencies as shared set(BUILD_SHARED_LIBS OFF CACHE BOOL "Global flag to cause add_library() to create shared libraries if on." FORCE) # HIP dependency is handled earlier in the project cmake file # when VerifyCompiler.cmake is included. include(FetchContent) # Test dependencies if(BUILD_TEST) # NOTE1: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake # # FindGTest.cmake defines: GTest::GTest, GTest::Main, GTEST_FOUND # # GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main # # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets # will be duplicately defined. # # NOTE3: The following snippet first tries to find Google Test binary either in MODULE or CONFIG modes. 
# If neither succeeds it goes on to import Google Test into this build either from a system # source package (apt install googletest on Ubuntu 18.04 only) or GitHub and defines the MODULE # mode targets. Otherwise if MODULE or CONFIG succeeded, then it prints the result to the # console via a non-QUIET find_package call and if CONFIG succeeded, creates ALIAS targets # with the MODULE IMPORTED names. if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(GTest QUIET) endif() if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest) option(BUILD_GTEST "Builds the googletest subproject" ON) option(BUILD_GMOCK "Builds the googlemock subproject" OFF) option(INSTALL_GTEST "Enable installation of googletest." OFF) if(EXISTS /usr/src/googletest AND NOT DEPENDENCIES_FORCE_DOWNLOAD) FetchContent_Declare( googletest SOURCE_DIR /usr/src/googletest ) else() message(STATUS "Google Test not found. Fetching...") FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG e2239ee6043f73722e7aa812a459f54a28552929 # release-1.11.0 ) endif() FetchContent_MakeAvailable(googletest) add_library(GTest::GTest ALIAS gtest) add_library(GTest::Main ALIAS gtest_main) else() find_package(GTest REQUIRED) if(TARGET GTest::gtest_main AND NOT TARGET GTest::Main) add_library(GTest::GTest ALIAS GTest::gtest) add_library(GTest::Main ALIAS GTest::gtest_main) endif() endif() endif(BUILD_TEST) if(BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(benchmark CONFIG QUIET) endif() if(NOT TARGET benchmark::benchmark) message(STATUS "Google Benchmark not found. Fetching...") option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." OFF) option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark." 
OFF) FetchContent_Declare( googlebench GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.8.0 ) set(HAVE_STD_REGEX ON) set(RUN_HAVE_STD_REGEX 1) FetchContent_MakeAvailable(googlebench) if(NOT TARGET benchmark::benchmark) add_library(benchmark::benchmark ALIAS benchmark) endif() else() find_package(benchmark CONFIG REQUIRED) endif() endif(BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(ROCM 0.11.0 CONFIG QUIET PATHS "${ROCM_ROOT}") # rocm-cmake endif() if(NOT ROCM_FOUND) message(STATUS "ROCm CMake not found. Fetching...") # We don't really want to consume the build and test targets of ROCm CMake. # CMake 3.18 allows omitting them, even though there's a CMakeLists.txt in source root. if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set(SOURCE_SUBDIR_ARG SOURCE_SUBDIR "DISABLE ADDING TO BUILD") else() set(SOURCE_SUBDIR_ARG) endif() set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") FetchContent_Declare( rocm-cmake GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git GIT_TAG rocm-6.1.2 ${SOURCE_SUBDIR_ARG} ) FetchContent_GetProperties(rocm-cmake) if(NOT rocm-cmake_POPULATED) # rocm-cmake 0.12.0 and higher needs to built from source FetchContent_Populate(rocm-cmake) message("Populated: ${rocm-cmake_SOURCE_DIR}") execute_process( WORKING_DIRECTORY ${rocm-cmake_SOURCE_DIR} COMMAND ${CMAKE_COMMAND} ${rocm-cmake_SOURCE_DIR} -DCMAKE_INSTALL_PREFIX=. 
) execute_process( WORKING_DIRECTORY ${rocm-cmake_SOURCE_DIR} COMMAND ${CMAKE_COMMAND} --build ${rocm-cmake_SOURCE_DIR} --target install ) endif() FetchContent_MakeAvailable(rocm-cmake) find_package(ROCM CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}") else() find_package(ROCM 0.11.0 CONFIG REQUIRED PATHS "${ROCM_ROOT}") endif() # Restore user global state set(CMAKE_CXX_FLAGS ${USER_CXX_FLAGS}) if(DEFINED USER_BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS ${USER_BUILD_SHARED_LIBS}) else() unset(BUILD_SHARED_LIBS CACHE ) endif() set(ROCM_WARN_TOOLCHAIN_VAR ${USER_ROCM_WARN_TOOLCHAIN_VAR} CACHE BOOL "") include(ROCMSetupVersion) include(ROCMCreatePackage) include(ROCMInstallTargets) include(ROCMPackageConfigHelpers) include(ROCMInstallSymlinks) include(ROCMHeaderWrapper) include(ROCMCheckTargetIds) include(ROCMClients) if(BUILD_DOCS) include(ROCMSphinxDoc) endif() rocPRIM-rocm-6.4.3/cmake/GenerateResourceSpec.cmake000077500000000000000000000066241502235215600221100ustar00rootroot00000000000000
#!/usr/bin/cmake -P

# Runs rocminfo, collects every "--gfx<ip>" entry it prints, and writes
# resources.json into CMAKE_CURRENT_BINARY_DIR. The file groups the devices by
# gfx IP name; each device is identified by its position in the rocminfo output.

find_program(ROCMINFO_EXECUTABLE
  rocminfo
)

if(NOT ROCMINFO_EXECUTABLE)
  message(FATAL_ERROR "rocminfo not found")
endif()

execute_process(
  COMMAND ${ROCMINFO_EXECUTABLE}
  RESULT_VARIABLE ROCMINFO_EXIT_CODE
  OUTPUT_VARIABLE ROCMINFO_STDOUT
  ERROR_VARIABLE ROCMINFO_STDERR
)

if(ROCMINFO_EXIT_CODE)
  message(SEND_ERROR "rocminfo exited with ${ROCMINFO_EXIT_CODE}")
  message(FATAL_ERROR "${ROCMINFO_STDERR}")
endif()

# Quoted so that empty output or output containing ';' is passed as one argument.
string(REGEX MATCHALL [[--(gfx[0-9a-f]+)]]
  ROCMINFO_MATCHES
  "${ROCMINFO_STDOUT}"
)

# NOTE: Unfortunately we don't have structs in CMake,
#       neither do we have std::partition only list(SORT)
#
# Transform raw regex matches to pairs of gfx IP and device id
# This will be our struct emulation. In C++ it would be
#
# struct device
# {
#   std::string ip;
#   int id;
# };
#
# std::vector<device> GFXIP_AND_ID{ {"gfx900",0},{"gfx803",1},{"gfx900",2} };
# std::sort(GFXIP_AND_ID.begin(), GFXIP_AND_ID.end(),
#   [](const device& lhs, const device& rhs)
#   {
#     return std::lexicographical_compare(lhs.ip.begin(), lhs.ip.end(),
#                                         rhs.ip.begin(), rhs.ip.end());
#   });
#
set(GFXIP_AND_ID)
set(ID 0)
foreach(ROCMINFO_MATCH IN LISTS ROCMINFO_MATCHES)
  string(REGEX REPLACE "--" "" ROCMINFO_MATCH "${ROCMINFO_MATCH}")
  list(APPEND GFXIP_AND_ID "${ROCMINFO_MATCH}:${ID}")
  math(EXPR ID "${ID} + 1")
endforeach()
list(SORT GFXIP_AND_ID)

# list(GET) on an empty list fails with an unhelpful message; report the real
# cause instead when rocminfo listed no gfx device at all.
list(LENGTH GFXIP_AND_ID COUNT)
if(COUNT EQUAL 0)
  message(FATAL_ERROR "rocminfo did not report any gfx device")
endif()

# Now comes the tricky part: implementing the following C++ logic
#
# std::stringstream JSON_PAYLOAD;
# auto it = GFXIP_AND_ID.begin();
# while (it != GFXIP_AND_ID.end())
# {
#   auto IT = std::find_if(it, GFXIP_AND_ID.end(),
#     [=](const device& ip_id){ return ip_id.ip.compare(it->ip) != 0; });
#   JSON_PAYLOAD << "\n \"" << it->ip << "\": [";
#   std::for_each(it, IT, [&](const device& ip_id)
#   {
#     JSON_PAYLOAD <<
#       "\n {\n" <<
#       " \"id\": \"" << ip_id.id << "\"\n" <<
#       " },";
#   });
#   JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma
#   JSON_PAYLOAD << "\n ],";
#   it = IT;
# }
# JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma
#
set(JSON_PAYLOAD)
set(IT1 0)
list(GET GFXIP_AND_ID ${IT1} I1)
string(REGEX REPLACE [[:[0-9]+$]] "" IP1 "${I1}")
while(IT1 LESS COUNT)
  string(APPEND JSON_PAYLOAD "\n \"${IP1}\": [")
  set(IT2 ${IT1})
  list(GET GFXIP_AND_ID ${IT2} I2)
  string(REGEX REPLACE [[:[0-9]+$]] "" IP2 "${I2}")
  # The IP part may contain hex digits (e.g. gfx90a), exactly as matched above;
  # stripping only "gfx[0-9]+:" would leave such entries untouched.
  string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 "${I2}")
  # Bare variable names: if() dereferences them once, so the gfx name itself is
  # never re-interpreted as a variable.
  while(IT2 LESS COUNT AND IP2 STREQUAL IP1)
    string(APPEND JSON_PAYLOAD
      "\n {\n"
      " \"id\": \"${ID2}\"\n"
      " },"
    )
    math(EXPR IT2 "${IT2} + 1")
    if(IT2 LESS COUNT)
      list(GET GFXIP_AND_ID ${IT2} I2)
      string(REGEX REPLACE [[:[0-9]+$]] "" IP2 "${I2}")
      string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 "${I2}")
    endif()
  endwhile()
  # Drop the comma left behind by the last device of this group.
  string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD "${JSON_PAYLOAD}")
  string(APPEND JSON_PAYLOAD "\n ],")
  set(IT1 ${IT2})
  set(IP1 ${IP2})
endwhile()
# Drop the comma left behind by the last group.
string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD "${JSON_PAYLOAD}")

set(JSON_HEAD [[{ "version": { "major": 1, "minor": 0 }, "local": [ {]]
)
set(JSON_TAIL [[ } ] }]]
)

file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/resources.json"
  "${JSON_HEAD}"
  "${JSON_PAYLOAD}"
  "${JSON_TAIL}"
)
rocPRIM-rocm-6.4.3/cmake/Summary.cmake000066400000000000000000000055431502235215600174640ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE.
# Prints the configuration summary as a sequence of STATUS messages: system,
# compiler (HIP or C++ depending on USE_HIPCXX), build type(s), install prefix,
# device targets and the values of the project's build options.
function(print_configuration_summary)
  # Prepare every row that depends on the selected language up front, so the
  # printing below is one straight sequence.
  if(USE_HIPCXX)
    string(STRIP "${CMAKE_HIP_FLAGS}" stripped_flags)
    set(compiler_row " HIP compiler : ${CMAKE_HIP_COMPILER}")
    set(version_row " HIP compiler version : ${CMAKE_HIP_COMPILER_VERSION}")
    set(flags_row " HIP flags : ${stripped_flags}")
    set(targets_row " Device targets : ${CMAKE_HIP_ARCHITECTURES}")
  else()
    string(STRIP "${CMAKE_CXX_FLAGS}" stripped_flags)
    set(compiler_row " C++ compiler : ${CMAKE_CXX_COMPILER}")
    set(version_row " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}")
    set(flags_row " CXX flags : ${stripped_flags}")
    set(targets_row " Device targets : ${GPU_TARGETS}")
  endif()

  # Multi-config generators report the list of configurations, single-config
  # generators the one selected build type.
  get_property(multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
  if(NOT multi_config)
    set(build_row " Build type : ${CMAKE_BUILD_TYPE}")
  else()
    set(build_row " Build types : ${CMAKE_CONFIGURATION_TYPES}")
  endif()

  message(STATUS "")
  message(STATUS "******** Summary ********")
  message(STATUS "General:")
  message(STATUS " System : ${CMAKE_SYSTEM_NAME}")
  message(STATUS "${compiler_row}")
  message(STATUS "${version_row}")
  message(STATUS "${flags_row}")
  message(STATUS "${build_row}")
  message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}")
  message(STATUS "${targets_row}")
  message(STATUS "")
  message(STATUS " ONLY_INSTALL : ${ONLY_INSTALL}")
  message(STATUS " BUILD_TEST : ${BUILD_TEST}")
  message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}")
  message(STATUS " BUILD_NAIVE_BENCHMARK : ${BUILD_NAIVE_BENCHMARK}")
  message(STATUS " BUILD_EXAMPLE : ${BUILD_EXAMPLE}")
  message(STATUS " BUILD_DOCS : ${BUILD_DOCS}")
endfunction()
rocPRIM-rocm-6.4.3/cmake/VerifyCompiler.cmake000066400000000000000000000033061502235215600207610ustar00rootroot00000000000000# MIT License # # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Extend the package search path with the ROCm locations, then locate HIP.
list(APPEND CMAKE_PREFIX_PATH
  ${ROCM_PATH}
  ${ROCM_PATH}/hip
  ${ROCM_PATH}/llvm
  /opt/rocm/llvm
  /opt/rocm
  /opt/rocm/hip
)

find_package(hip REQUIRED CONFIG
  PATHS
    ${HIP_DIR}
    ${ROCM_PATH}
    /opt/rocm
)

# Unless USE_HIPCXX is set, the C++ compiler itself has to be hipcc or clang++.
if(NOT USE_HIPCXX)
  if(NOT HIP_COMPILER STREQUAL "clang")
    message(FATAL_ERROR "HIP_COMPILER must be 'clang' (AMD ROCm platform)")
  elseif(NOT (HIP_CXX_COMPILER MATCHES ".*hipcc" OR HIP_CXX_COMPILER MATCHES ".*clang\\+\\+"))
    message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.")
  endif()
endif()
rocPRIM-rocm-6.4.3/common/000077500000000000000000000000001502235215600152265ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/common/utils.hpp000066400000000000000000000102041502235215600170740ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef COMMON_UTILS_HPP_
#define COMMON_UTILS_HPP_

// NOTE(review): the include target is not visible in this view of the file;
// the directive is kept exactly as found - confirm against the real header.
#include

#ifdef USE_GTEST
// GoogleTest-compatible HIP_CHECK macro. FAIL is called to log the Google Test trace.
// The lambda is invoked immediately as assertions that generate a fatal failure can
// only be used in void-returning functions.
// After logging, the process exits with the HIP error code.
// The macro expands to a bare brace block (not do/while), so take care when using
// it as the sole statement of an unbraced if/else.
#define HIP_CHECK(condition) \
    { \
        hipError_t error = condition; \
        if(error != hipSuccess) \
        { \
            [error]() \
            { FAIL() << "HIP error " << error << ": " << hipGetErrorString(error); }(); \
            exit(error); \
        } \
    }
#else
// Plain HIP_CHECK: on failure prints the HIP error string together with the file and
// line of the check to std::cout, then exits the process with the HIP error code.
#define HIP_CHECK(condition) \
    { \
        hipError_t error = condition; \
        if(error != hipSuccess) \
        { \
            std::cout << "HIP error: " << hipGetErrorString(error) << " file: " << __FILE__ \
                      << " line: " << __LINE__ << std::endl; \
            exit(error); \
        } \
    }
#endif

namespace common
{

// Returns the value of environment variable `name`.
// With MSVC the value comes from _dupenv_s (nullptr when that call reports an error)
// and must be released with clean_env(); elsewhere it is the std::getenv() result.
inline char* __get_env(const char* name)
{
    char* env;
#ifdef _MSC_VER
    errno_t err = _dupenv_s(&env, nullptr, name);
    if(err)
    {
        return nullptr;
    }
#else
    env = std::getenv(name);
#endif
    return env;
}

// Releases a value obtained from __get_env(): frees it with MSVC, otherwise a no-op.
inline void clean_env(char* env)
{
#ifdef _MSC_VER
    free(env);
#endif
    // Silences the unused-parameter warning on the non-MSVC path.
    (void)env;
}

// Returns true only when the environment variable ROCPRIM_USE_HMM is set to exactly "1".
inline bool use_hmm()
{
    char*      env = __get_env("ROCPRIM_USE_HMM");
    const bool hmm = (env != nullptr) && (strcmp(env, "1") == 0);
    clean_env(env);
    return hmm;
}

// Helper for HMM allocations: HMM is requested through ROCPRIM_USE_HMM=1 environment variable
// Allocates `size` bytes into *devPtr with hipMallocManaged when use_hmm() is true and
// with hipMalloc otherwise, and returns the HIP status of that call.
template
hipError_t hipMallocHelper(T** devPtr, size_t size)
{
    if(use_hmm())
    {
        return hipMallocManaged(reinterpret_cast(devPtr), size);
    }
    else
    {
        return hipMalloc(reinterpret_cast(devPtr), size);
    }
    // Not reachable: both branches above return.
    return hipSuccess;
}

} // namespace common

#endif // COMMON_UTILS_HPP_
rocPRIM-rocm-6.4.3/common/utils_device_ptr.hpp000066400000000000000000000535371502235215600213200ustar00rootroot00000000000000// Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_UTILS_DEVICE_PTR_HPP #define ROCPRIM_UTILS_DEVICE_PTR_HPP #include "utils.hpp" #include #include #include #include namespace common { /// \brief An RAII friendly class to manage the memory allocated on device. /// /// \tparam A Template type used by the class. template class device_ptr { public: using decay_type = std::decay_t; using size_type = std::size_t; using value_type = ValueType; private: // If value_type is void we want to emulate allocating bytes (uchar). using value_type_proxy = std::conditional_t::value, unsigned char, ValueType>; public: static constexpr size_t value_size = sizeof(value_type_proxy); device_ptr() : device_raw_ptr_(nullptr), number_of_ele_(0){}; /// \brief Construct with a pre-allocated memory space. 
device_ptr(size_type pre_alloc_number_of_ele) : device_raw_ptr_(nullptr), number_of_ele_(pre_alloc_number_of_ele) { size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); }; device_ptr(device_ptr const&) = delete; device_ptr(device_ptr&& other) noexcept : device_raw_ptr_(other.device_raw_ptr_), number_of_ele_(other.number_of_ele_) { other.leak(); }; /// \brief Construct by host vectors with the same sized value_type template explicit device_ptr(std::vector const& data) : device_raw_ptr_(nullptr), number_of_ele_(data.size()) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpy(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice)); } template explicit device_ptr(std::vector const& data, hipStream_t stream) : device_raw_ptr_(nullptr), number_of_ele_(data.size()) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice, stream)); } template explicit device_ptr(std::array const& data) : device_raw_ptr_(nullptr), number_of_ele_(Size) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = Size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpy(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice)); } template explicit device_ptr(std::array const& data, hipStream_t stream) : device_raw_ptr_(nullptr), number_of_ele_(Size) { 
static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = Size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice, stream)); } template explicit device_ptr(std::unique_ptr const& uptr, size_type size) : device_raw_ptr_(nullptr), number_of_ele_(size) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpy(device_raw_ptr_, uptr.get(), storage_size, hipMemcpyHostToDevice)); } template explicit device_ptr(std::unique_ptr const& uptr, size_type size, hipStream_t stream) : device_raw_ptr_(nullptr), number_of_ele_(size) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, uptr.get(), storage_size, hipMemcpyHostToDevice, stream)); } ~device_ptr() { free_manually(); }; device_ptr& operator=(device_ptr const&) = delete; device_ptr& operator=(device_ptr&& other) noexcept { free_manually(); device_raw_ptr_ = other.device_raw_ptr_; number_of_ele_ = other.number_of_ele_; other.leak(); return *this; }; /// \brief Do copy on the device. /// /// \return A new `device_ptr` rvalue. 
device_ptr duplicate() const { device_ptr ret; ret.number_of_ele_ = number_of_ele_; size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&ret.device_raw_ptr_, storage_size)); HIP_CHECK( hipMemcpy(ret.device_raw_ptr_, device_raw_ptr_, storage_size, hipMemcpyDeviceToDevice)); return ret; } device_ptr duplicate_async(hipStream_t stream) const { device_ptr ret; ret.number_of_ele_ = number_of_ele_; size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&ret.device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(ret.device_raw_ptr_, device_raw_ptr_, storage_size, hipMemcpyDeviceToDevice, stream)); return ret; } /// \brief Do type cast and move the ownership to the new `device_ptr`. /// /// \return A new `device_ptr` rvalue. template device_ptr move_cast() noexcept { using target_value_t = typename device_ptr::value_type; auto ret_deivce_raw_ptr_ = static_cast(static_cast(device_raw_ptr_)); auto ret_number_of_ele_ = value_size * number_of_ele_ / sizeof(target_value_t); leak(); return {ret_deivce_raw_ptr_, ret_number_of_ele_}; } /// \brief Get the device raw pointer value_type* get() const noexcept { return device_raw_ptr_; } /// \brief Clean every thing on this instance, which could lead to memory leak. 
Should call `get()` and free the raw pointer manually void leak() noexcept { device_raw_ptr_ = nullptr; number_of_ele_ = 0; } /// \brief Call this function to garbage the memory in advance void free_manually() { if(device_raw_ptr_ != nullptr) { HIP_CHECK(hipFree(device_raw_ptr_)); } leak(); } void resize(size_type new_number_of_ele) { if(new_number_of_ele == 0) { free_manually(); } else { value_type* device_temp_ptr = nullptr; HIP_CHECK(common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size)); HIP_CHECK(hipMemcpy(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } } void resize_async(size_type new_number_of_ele, hipStream_t stream) { if(new_number_of_ele == 0) { free_manually(); } else { value_type* device_temp_ptr = nullptr; HIP_CHECK(common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size)); HIP_CHECK(hipMemcpyAsync(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice, stream)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } } // if got error hipErrorOutOfMemory` return false, else return `true` bool resize_with_memory_check(size_type new_number_of_ele) { if(new_number_of_ele == 0) { free_manually(); } else { value_type* device_temp_ptr = nullptr; const auto err = common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size); if(err == hipErrorOutOfMemory) { return false; } HIP_CHECK(err); HIP_CHECK(hipMemcpy(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } return true; } bool resize_with_memory_check_async(size_type new_number_of_ele, hipStream_t stream) { if(new_number_of_ele == 0) { free_manually(); 
} else { value_type* device_temp_ptr = nullptr; const auto err = common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size); if(err == hipErrorOutOfMemory) { return false; } HIP_CHECK(err); HIP_CHECK(hipMemcpyAsync(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice, stream)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } return true; } /// \brief Get the size of this memory space size_type msize() const noexcept { return number_of_ele_ * value_size; } /// \brief Get the number of elements size_type size() const noexcept { return number_of_ele_; } /// \brief Copy from host to device template void store(std::vector const& host_vec, size_type offset = 0) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); if(host_vec.size() + offset > number_of_ele_) { resize(host_vec.size() + offset); } HIP_CHECK(hipMemcpy(device_raw_ptr_ + offset, host_vec.data(), host_vec.size() * value_size, hipMemcpyHostToDevice)); } template void store(std::array const& host_arr) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); if(Size > number_of_ele_) { resize(Size); } HIP_CHECK( hipMemcpy(device_raw_ptr_, host_arr.data(), Size * value_size, hipMemcpyHostToDevice)); } template void store(std::unique_ptr const& uptr, size_type offset, size_type number_of_ele) { static_assert( sizeof(InValueType) == value_size, "value_type of input unique_ptr must have the same size with device_ptr::value_type"); if(offset + number_of_ele > number_of_ele_) { resize(offset + number_of_ele); } HIP_CHECK(hipMemcpy(device_raw_ptr_ + offset, uptr.get(), number_of_ele * value_size, hipMemcpyHostToDevice)); } template void store_async(std::vector const& host_vec, hipStream_t stream) { static_assert( sizeof(InValueType) == value_size, "value_type 
of input vector must have the same size with device_ptr::value_type"); if(host_vec.size() > number_of_ele_) { resize(host_vec.size()); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, host_vec.data(), host_vec.size() * value_size, hipMemcpyHostToDevice, stream)); } template void store_async(std::array const& host_arr, hipStream_t stream) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); if(Size > number_of_ele_) { resize(Size); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, host_arr.data(), Size * value_size, hipMemcpyHostToDevice, stream)); } template void store_async(std::unique_ptr const& uptr, size_type offset, size_type number_of_ele, hipStream_t stream) { static_assert( sizeof(InValueType) == value_size, "value_type of input unique_ptr must have the same size with device_ptr::value_type"); if(offset + number_of_ele > number_of_ele_) { resize(offset + number_of_ele); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_ + offset, uptr.get(), number_of_ele * value_size, hipMemcpyHostToDevice, stream)); } // will not check the boundary void store_value_at(size_type pos, value_type_proxy const& value) { HIP_CHECK(hipMemcpy(device_raw_ptr_ + pos, &value, value_size, hipMemcpyHostToDevice)); } // will not check the boundary template void store_value_at_async(size_type pos, value_type_proxy const& value, hipStream_t stream) { HIP_CHECK( hipMemcpy(device_raw_ptr_ + pos, &value, value_size, hipMemcpyHostToDevice, stream)); } /// \brief Copy from device to device template void replace(device_ptr const& device_ptr) { static_assert(sizeof(InPtrValueType) == value_size, "sizeof(InPtrValueType) must equal to value_size"); if(device_ptr.number_of_ele_ > number_of_ele_) { resize(device_ptr.number_of_ele_); } HIP_CHECK(hipMemcpy(device_raw_ptr_, device_ptr.device_raw_ptr_, device_ptr.number_of_ele_ * value_size, hipMemcpyDeviceToDevice)); } template void replace_async(device_ptr const& device_ptr, hipStream_t stream) { 
static_assert(sizeof(InPtrValueType) == value_size, "sizeof(InPtrValueType) must equal to value_size"); if(device_ptr.number_of_ele_ > number_of_ele_) { resize(device_ptr.number_of_ele_); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, device_ptr.device_raw_ptr_, device_ptr.number_of_ele_ * value_size, hipMemcpyDeviceToDevice, stream)); } void memset(size_type offset, int value, size_type size_bytes) { HIP_CHECK(hipMemset(reinterpret_cast(device_raw_ptr_) + offset, value, static_cast(size_bytes))); } void memset_async(size_type offset, int value, size_type size_bytes, hipStream_t stream) { HIP_CHECK(hipMemsetAsync(reinterpret_cast(device_raw_ptr_) + offset, value, static_cast(size_bytes), stream)); } /// \brief Copy from device to host /// This function will store loaded values into std::vector auto load() const { std::vector ret(number_of_ele_); HIP_CHECK(hipMemcpy(ret.data(), device_raw_ptr_, number_of_ele_ * value_size, hipMemcpyDeviceToHost)); return ret; } auto load_async(hipStream_t stream) const { std::vector ret(number_of_ele_); HIP_CHECK(hipMemcpyAsync(ret.data(), device_raw_ptr_, number_of_ele_ * value_size, hipMemcpyDeviceToHost, stream)); return ret; } template auto load_to_array() const { std::array ret; HIP_CHECK(hipMemcpy(ret.data(), device_raw_ptr_, std::min(number_of_ele_, Size) * value_size, hipMemcpyDeviceToHost)); return ret; } template auto load_to_array_async(hipStream_t stream) const { std::array ret; HIP_CHECK(hipMemcpyAsync(ret.data(), device_raw_ptr_, std::min(number_of_ele_, Size) * value_size, hipMemcpyDeviceToHost, stream)); return ret; } auto load_to_unique_ptr() const { std::unique_ptr ret(new value_type[number_of_ele_]); HIP_CHECK(hipMemcpy(ret.get(), device_raw_ptr_, number_of_ele_ * value_size, hipMemcpyDeviceToHost)); return ret; } auto load_to_unique_ptr_async(hipStream_t stream) const { std::unique_ptr ret(new value_type[number_of_ele_]); HIP_CHECK(hipMemcpyAsync(ret.get(), device_raw_ptr_, number_of_ele_ * value_size, 
hipMemcpyDeviceToHost, stream)); return ret; } auto load_value_at(size_type pos) const { value_type ret; HIP_CHECK(hipMemcpy(&ret, device_raw_ptr_ + pos, value_size, hipMemcpyDeviceToHost)); return ret; } auto load_value_at_async(size_type pos, hipStream_t stream) const { value_type ret; HIP_CHECK( hipMemcpyAsync(&ret, device_raw_ptr_ + pos, value_size, hipMemcpyDeviceToHost, stream)); return ret; } private: value_type* device_raw_ptr_; size_type number_of_ele_; }; } // namespace common #endif rocPRIM-rocm-6.4.3/conanfile.py000066400000000000000000000012051502235215600162440ustar00rootroot00000000000000# Copyright 2021 Advanced Micro Devices, Inc. # This conanfile is used to install development requirements, # e.g. # conan install -o clients=True -if build/deps . from conans import ConanFile, CMake class ConanPkgReqs(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake_find_package" options = { "shared": [True, False], "clients": [True, False], } default_options = { "shared": True, "clients": False, } def requirements(self): if self.options.clients: self.requires("gtest/1.11.0") self.requires("benchmark/1.5.2") rocPRIM-rocm-6.4.3/custom.properties000066400000000000000000000001361502235215600173660ustar00rootroot00000000000000booktitle=rocPRIM API Guide spreadsheet.xml=docs/classification-map.xml document.locale=enusrocPRIM-rocm-6.4.3/docs/000077500000000000000000000000001502235215600146665ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/docs/.gitignore000066400000000000000000000001171502235215600166550ustar00rootroot00000000000000/_build/ /_doxygen/ /doxygen/html /doxygen/xml /doxygen/*.tag /sphinx/_toc.yml rocPRIM-rocm-6.4.3/docs/CMakeLists.txt000066400000000000000000000025701502235215600174320ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # rocPRIM documentation include(GNUInstallDirs) rocm_add_sphinx_doc( "${CMAKE_CURRENT_SOURCE_DIR}" BUILDER html OUTPUT_DIR html USES_DOXYGEN ) install( DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html" DESTINATION ${CMAKE_INSTALL_DOCDIR} ) rocPRIM-rocm-6.4.3/docs/block_ops/000077500000000000000000000000001502235215600166415ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/docs/block_ops/data_mov_funcs.rst000066400000000000000000000047321502235215600223710ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _data_mov_funcs: ******************************************************************** Data movement functions ******************************************************************** Direct Blocked =============== Load ------ .. 
doxygenfunction:: rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid) .. doxygenfunction:: rocprim::block_load_direct_blocked (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds) Store ---------- .. doxygenfunction:: rocprim::block_store_direct_blocked (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_store_direct_blocked (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid) Direct Blocked Vectorized =========================== Load ------- .. doxygenfunction:: rocprim::block_load_direct_blocked_vectorized (unsigned int flat_id, T *block_input, U(&items)[ItemsPerThread]) Store ---------- .. doxygenfunction:: rocprim::block_store_direct_blocked_vectorized (unsigned int flat_id, T *block_output, U(&items)[ItemsPerThread]) Direct Striped ================== Load --------- .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid) .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds) Store ---------- .. doxygenfunction:: rocprim::block_store_direct_striped (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_store_direct_striped (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid) Direct Warp Striped ==================== Load --------- .. 
doxygengroup:: blockmodule_warp_load_functions :content-only: Store ---------- .. doxygengroup:: blockmodule_warp_store_functions :content-only: rocPRIM-rocm-6.4.3/docs/block_ops/index.rst000066400000000000000000000011441502235215600205020ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _block-index: ******************************************************************** Block-Wide Operations ******************************************************************** * :ref:`class-index` * :ref:`blk-load` * :ref:`blk-store` * :ref:`blk-adjacent_difference` * :ref:`blk-discontinuity` * :ref:`blk-scan` * :ref:`blk-reduce` * :ref:`blk-shuffle` * :ref:`blk-exchange` * :ref:`blk-sort` * :ref:`blk-histogram` * :ref:`data_mov_funcs` rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/000077500000000000000000000000001502235215600211575ustar00rootroot00000000000000rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/adjacent_difference.rst000066400000000000000000000005721502235215600256400ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-adjacent_difference: ******************************************************************** Adjacent difference ******************************************************************** .. doxygenclass:: rocprim::block_adjacent_difference :members: rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/discontinuity.rst000066400000000000000000000005501502235215600246160ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-discontinuity: ******************************************************************** Discontinuity ******************************************************************** .. 
doxygenclass:: rocprim::block_discontinuity :members: rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/exchange.rst000066400000000000000000000005311502235215600234720ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-exchange: ******************************************************************** Exchange ******************************************************************** .. doxygenclass:: rocprim::block_exchange :members: rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/histogram.rst000066400000000000000000000006721502235215600237130ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-histogram: ******************************************************************** Histogram ******************************************************************** Class ========= .. doxygenclass:: rocprim::block_histogram :members: Algorithms =========== .. doxygenenum:: rocprim::block_histogram_algorithm rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/index.rst000066400000000000000000000010311502235215600230130ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _class-index: ******************************************************************** Operation classes ******************************************************************** * :ref:`blk-load` * :ref:`blk-store` * :ref:`blk-adjacent_difference` * :ref:`blk-discontinuity` * :ref:`blk-scan` * :ref:`blk-reduce` * :ref:`blk-shuffle` * :ref:`blk-exchange` * :ref:`blk-sort` * :ref:`blk-histogram` rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/load.rst000066400000000000000000000006471502235215600226370ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. 
_blk-load: ******************************************************************** Load ******************************************************************** Class ========== .. doxygenclass:: rocprim::block_load :members: Algorithms ============== .. doxygenenum:: rocprim::block_load_method rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/reduce.rst000066400000000000000000000006601502235215600231620ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-reduce: ******************************************************************** Reduce ******************************************************************** Class ========== .. doxygenclass:: rocprim::block_reduce :members: Algorithms ============ .. doxygenenum:: rocprim::block_reduce_algorithm rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/scan.rst000066400000000000000000000006471502235215600226440ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-scan: ******************************************************************** Scan ******************************************************************** Class ======= .. doxygenclass:: rocprim::block_scan :members: Algorithms ============== .. doxygenenum:: rocprim::block_scan_algorithm rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/shuffle.rst000066400000000000000000000005261502235215600233500ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-shuffle: ******************************************************************** Shuffle ******************************************************************** .. doxygenclass:: rocprim::block_shuffle :members: rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/sort.rst000066400000000000000000000007661502235215600227110ustar00rootroot00000000000000.. 
meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-sort: ******************************************************************** Sort ******************************************************************** Generic Block Sort ================== .. doxygenclass:: rocprim::block_sort :members: .. doxygenenum:: rocprim::block_sort_algorithm Radix sort =========== .. doxygenclass:: rocprim::block_radix_sort :members: rocPRIM-rocm-6.4.3/docs/block_ops/ops_classes/store.rst000066400000000000000000000006441502235215600230510ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-store: ******************************************************************** Store ******************************************************************** Class ====== .. doxygenclass:: rocprim::block_store :members: Algorithms =========== .. doxygenenum:: rocprim::block_store_method rocPRIM-rocm-6.4.3/docs/classification-map.xml000066400000000000000000000206731502235215600211660ustar00rootroot00000000000000 Kanika Yadav (external) Microsoft Office User 2020-09-25T06:54:04Z 2021-12-22T19:07:50Z 16.00 true 2021-02-23T09:13:03Z Standard 90c2fedb-0da6-4717-8531-d16a1b9930f4 45597f60-6e37-4be7-acfb-4c9e23b261ea 0 true 2022-01-14T16:33:39Z Privileged AMD Official Use Only-AIP 2.0 3dd8961f-e488-4e60-8e11-a82d994e183d 3ab6c0f7-c658-4f6f-bd9d-6ef921551ff7 1 14235 32767 32767 32767 False False Filename Title Categories Version Doc Type MAP rocm;hip-sdk;hip;gpu;amd;prim;rocprim 4-5 apply-ALL default rocPRIM API Guide reference