pax_global_header00006660000000000000000000000064150153734130014514gustar00rootroot0000000000000052 comment=d790d3ed379830da01564df1762a6d7c94eee965 rocFFT-rocm-6.4.3/000077500000000000000000000000001501537341300136075ustar00rootroot00000000000000rocFFT-rocm-6.4.3/.azuredevops/000077500000000000000000000000001501537341300162345ustar00rootroot00000000000000rocFFT-rocm-6.4.3/.azuredevops/rocm-ci.yml000066400000000000000000000012401501537341300203050ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/rocFFT.yml@pipelines_repo rocFFT-rocm-6.4.3/.clang-format000066400000000000000000000065421501537341300161710ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true 
AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 'true' } #BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true #BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: All ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false 
SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- rocFFT-rocm-6.4.3/.githooks/000077500000000000000000000000001501537341300155145ustar00rootroot00000000000000rocFFT-rocm-6.4.3/.githooks/install000077500000000000000000000002221501537341300171040ustar00rootroot00000000000000#!/usr/bin/env bash cd $(git rev-parse --git-dir) cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" rocFFT-rocm-6.4.3/.githooks/pre-commit000077500000000000000000000017671501537341300175310ustar00rootroot00000000000000#!/bin/sh # # This pre-commit hook checks if any versions of clang-format # are installed, and if so, uses the installed version to format # the staged changes. base=/opt/rocm/llvm/bin/clang-format format="" # Redirect output to stderr. exec 1>&2 # check if clang-format is installed type "$base" >/dev/null 2>&1 && format="$base" # no versions of clang-format are installed if [ -z "$format" ] then echo "$base is not installed. Pre-commit hook will not be executed." 
exit 0 fi # Do everything from top - level cd $(git rev-parse --show-toplevel) if git rev-parse --verify HEAD >/dev/null 2>&1 then against=HEAD else # Initial commit: diff against an empty tree object against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 fi # do the formatting for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') do if [ -e "$file" ] then echo "$format $file" "$format" -i -style=file "$file" fi done rocFFT-rocm-6.4.3/.github/000077500000000000000000000000001501537341300151475ustar00rootroot00000000000000rocFFT-rocm-6.4.3/.github/CODEOWNERS000077500000000000000000000005761501537341300165550ustar00rootroot00000000000000* @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts # Documentation files docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation # Header directory for Doxygen documentation library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts rocFFT-rocm-6.4.3/.github/CONTRIBUTING.md000066400000000000000000000146331501537341300174070ustar00rootroot00000000000000 # Contributing to rocFFT # We welcome contributions to rocFFT. Please follow these details to help ensure your contributions will be successfully accepted. ## Issue Discussion ## Please use the GitHub Issues tab to notify us of issues. * Use your best judgment for issue creation. If your issue is already listed, upvote the issue and comment or post to provide additional details, such as how you reproduced this issue. * If you're not sure if your issue is the same, err on the side of caution and file your issue. You can add a comment to include the issue number (and link) for the similar issue. If we evaluate your issue as being the same as the existing issue, we'll close the duplicate. * If your issue doesn't exist, use the issue template to file a new issue. 
* When filing an issue, be sure to provide as much information as possible, including script output so we can collect information about your configuration. This helps reduce the time required to reproduce your issue. * Check your issue regularly, as we may require additional information to successfully reproduce the issue. * You may also open an issue to ask questions to the maintainers about whether a proposed change meets the acceptance criteria, or to discuss an idea pertaining to the library. ## Acceptance Criteria ## When a contribution is submitted via a pull request, a number of automated checks are run in order to verify compilation correctness and prevent performance regressions. These checks include: * Building and testing the change on various OS platforms (Ubuntu, RHEL, etc.) * Running on different GPU architectures (MI-series, Radeon series cards, etc.) * Running benchmarks to check for performance degradation In order for a submission to be accepted: * It must pass all of the automated checks * It must undergo a code review Users can visualize our continuous integration infrastructure in: `rocFFT/.jenkins`. The GitHub "Issues" tab may also be used to discuss ideas surrounding particular features or changes before raising pull requests. ## Code Structure ## In a broad view, rocFFT library is structured as follows: ├── docs/: contains rocFFT documentation ├── library/: contains main source code and headers ├── clients/: │   ├── bench/ : contains benchmarking code │   ├── samples/ : contains examples │   ├── tests/ : contains our test infrastructure ├── shared/: contains important global headers and those for linking to other applications ## Coding Style ## * All public APIs are C89 compatible; all other library code should use c++17. * Our minimum supported compiler is clang 3.6. * Avoid CamelCase: rule applies specifically to publicly visible APIs, but is encouraged (not mandated) for internal code. 
* C and C++ code should be formatted using `clang-format`. You can use the clang-format version available in `rocFFT/.clang-format`. To format a C/C++ file, use: ``` clang-format -style=file -i ``` * Python code should use: ``` yapf --style pep8 ``` ## Pull Request Guidelines ## Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. Note that a [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. The following guidelines apply: * When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch. * Note that releases are cut to release/rocm-rel-x.y, where x and y refer to the release major and minor numbers. * Ensure code builds successfully. * Do not break existing test cases * Code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit. ### Deliverables ### New changes should include test coverage. Our testing infrastructure is located in `clients/tests/`, and can be used as a reference. The following guidelines apply: * New functionality will only be merged with new unit tests. * New unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). * Tests must have good code coverage. ### Process ### All pull requests must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. 
Once a contribution is ready to be submitted, consider the following: * Before you create a PR, ensure that all files have been gone through the clang formatting: clang-format -i * While creating a PR, you can take a look at a `diff` of the changes you made using the PR's "Files" tab, and verify that no unintentional changes are being submitted. * Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails. * During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. * When a modification request has been completed, the conversation thread about it will be marked as resolved. * To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request. * Once your contribution is approved, we will use the *squash merge* option from GitHub to integrate it to the corresponding branch. ## Code License ## All code contributed to this project will be licensed under the license identified in the [LICENSE.md](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md). Your contribution will be accepted under the same license. 
rocFFT-rocm-6.4.3/.github/ISSUE_TEMPLATE.md000066400000000000000000000004611501537341300176550ustar00rootroot00000000000000### What is the expected behavior - ### What actually happens - ### How to reproduce - ### Environment | Hardware | description | |-----|-----| | GPU | device string | | CPU | device string | | Software | version | |-----|-----| | ROCK | v0.0 | | ROCR | v0.0 | | HCC | v0.0 | | Library | v0.0 | rocFFT-rocm-6.4.3/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000000701501537341300207450ustar00rootroot00000000000000resolves #___ Summary of proposed changes: - - - rocFFT-rocm-6.4.3/.github/dependabot.yml000066400000000000000000000012231501537341300177750ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" rocFFT-rocm-6.4.3/.gitignore000066400000000000000000000005541501537341300156030ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # vim tags tags .tags .*.swp # Visual Studio Code .vscode # install.sh build dir build/ # python bytecode __pycache__ 
rocFFT-rocm-6.4.3/.jenkins/000077500000000000000000000000001501537341300153265ustar00rootroot00000000000000rocFFT-rocm-6.4.3/.jenkins/application.groovy000066400000000000000000000247451501537341300211140ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'application') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocFFT', 'hipFFT'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> def getDependenciesCommand = "" if (project.installLibraryDependenciesFromCI) { project.libraryDependencies.each { libraryName -> getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) } } def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix} ${getDependenciesCommand} git clone -b develop-2021 https://github.com/ROCmSoftwarePlatform/Gromacs.git cd Gromacs mkdir build_tmpi cd build_tmpi cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DGMX_MPI=OFF -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. make make install cd .. 
mkdir build_mpi cd build_mpi cmake -DCMAKE_HIP_ARCHITECTURES=gfx908 -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DGMX_MPI=ON -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. make make install cd .. """ platform.runCommand(this, command) } def testCommand = { platform, project-> def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix} cd Gromacs source gromacs-install/bin/GMXRC gmx --version export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib echo \$LD_LIBRARY_PATH git clone https://github.com/jychang48/benchmark-gromacs.git cd benchmark-gromacs export GMX_MAXBACKUP=-1 echo "* Threaded MPI ******************************************************************************************************" #ADH_DODEC cd adh_dodec tar zxf adh_dodec.tar.gz gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 150 # 8 GPUs # STMV cd .. 
cd stmv/ tar zxf stmv.tar.gz gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 200 # 1 GPU gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 400 # 4 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 400 # 8 GPUs # CELLULOSE_NVE cd .. cd cellulose_nve/ tar zxf cellulose_nve.tar.gz gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 200 # 8 GPUs echo "* MPI ***************************************************************************************************************" # ADH_DODEC cd .. 
cd adh_dodec/ tar zxf adh_dodec.tar.gz mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs # STMV cd .. cd stmv/ tar zxf stmv.tar.gz mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -nstlist 400 -gpu_id 0 -s topol.tpr # 1 GPU mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs # CELLULOSE_NVE cd .. 
cd cellulose_nve/ tar zxf cellulose_nve.tar.gz mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs """ platform.runCommand(this, command) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 5')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['8gfx90a']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['8gfx90a']], urlJobName) } } } rocFFT-rocm-6.4.3/.jenkins/common.groovy000066400000000000000000000145451501537341300200760ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false, boolean buildMPI=false) { project.paths.construct_build_prefix() def getDependenciesCommand = "" if (project.installLibraryDependenciesFromCI) { project.libraryDependencies.each { libraryName -> getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) } } String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON' String warningArgs = '-DWERROR=ON' String buildTunerArgs = '-DROCFFT_BUILD_OFFLINE_TUNER=ON' String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String buildMPIArgs = buildMPI ? '-DCMAKE_PREFIX_PATH=/usr/local/openmpi -DROCFFT_MPI_ENABLE=ON' : '' String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : '' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db" def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} ${getDependenciesCommand} set -e mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} ${buildMPIArgs} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${warningArgs} ${buildTunerArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../.. 
make -j\$(nproc) sudo make install """ platform.runCommand(this, command) } def runCompileClientCommand(platform, project, jobName, boolean debug=false) { project.paths.construct_build_prefix() String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON' String warningArgs = '-DWERROR=ON' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' String buildTypeArgClients = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' String cmakePrefixPathArg = "-DCMAKE_PREFIX_PATH=${project.paths.project_build_prefix}" def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/clients mkdir -p build && cd build ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArgClients} ${clientArgs} ${warningArgs} ${cmakePrefixPathArg} ${amdgpuTargets} ../ make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project, boolean debug=false, gfilter='', extraArgs='') { String testBinaryName = 'rocfft-test' String directory = debug ? 'debug' : 'release' String gfilterArg = '' if (gfilter) { gfilterArg = "--gtest_filter=${gfilter}" } def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} --gtest_color=yes --R 80 --nrand 10 ${extraArgs} """ platform.runCommand(this, command) } def runPackageCommand(platform, project, jobName, boolean debug=false) { String directory = debug ? 
'debug' : 'release' def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${directory}",false) platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) //trim temp files def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/build/${directory}/ rm -rf _CPack_Packages/ find -name '*.o' -delete """ platform.runCommand(this, command) } def runSubsetBuildCommand(platform, project, jobName, genPattern, genSmall, genLarge, boolean onlyDouble) { project.paths.construct_build_prefix() // Don't build clients, since we're just testing if the library can build String clientArgs = '' String warningArgs = '-DWERROR=ON' String buildTypeArg = '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = 'release' String genPatternArgs = "-DGENERATOR_PATTERN=${genPattern}" String manualSmallArgs = (genSmall != null) ? "-DGENERATOR_MANUAL_SMALL_SIZE=${genSmall}" : '' String manualLargeArgs = (genLarge != null) ? "-DGENERATOR_MANUAL_LARGE_SIZE=${genLarge}" : '' String precisionArgs = onlyDouble ? '-DGENERATOR_PRECISION=double' : '' String kernelArgs = "${genPatternArgs} ${manualSmallArgs} ${manualLargeArgs} ${precisionArgs}" String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db" def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix} rm -rf build/${buildTypeDir} mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${amdgpuTargets} ${rtcBuildCache} ../.. 
make -j\$(nproc) """ platform.runCommand(this, command) } return this rocFFT-rocm-6.4.3/.jenkins/debug.groovy000066400000000000000000000046401501537341300176670ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'Debug') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, true) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project, true) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project, jobName, true) } buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { 
runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['any']], urlJobName) } } } rocFFT-rocm-6.4.3/.jenkins/extended.groovy000066400000000000000000000070201501537341300203740ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runBitwiseReproTest (platform, project, boolean debug=false, gfilter='', reprodb='', int repeat=1) { String testBinaryName = 'rocfft-test' String directory = debug ? 'debug' : 'release' String gfilterArg = '' if (gfilter) { gfilterArg = "--gtest_filter=${gfilter}" } String reproDbArg = '' if (reprodb) { reproDbArg = "--repro-db=${reprodb}" } String repeatArg = '' if (repeat > 1) { repeatArg = "--gtest_repeat=${repeat}" } def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} ${reproDbArg} ${repeatArg} --gtest_color=yes --R 80 --nrand 10 """ platform.runCommand(this, command) } def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'Extended') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { 
platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) commonGroovy.runCompileClientCommand(platform, project, jobName, false) } def testCommand = { platform, project-> runBitwiseReproTest(platform, project, false, "*pow2_1D/bitwise_repro_test*", 'bitwise_repro.db', 2) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project, jobName) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName) } } } rocFFT-rocm-6.4.3/.jenkins/multigpu.groovy000066400000000000000000000055411501537341300204500ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path

// Builds and tests the 'multigpu' rocFFT project (MPI-enabled build) on the
// nodes described by nodeDetails. Nothing is packaged.
def runCI = { nodeDetails, jobName->
    def prj = new rocProject('rocFFT-internal', 'multigpu')

    prj.defaults.ccache = true
    prj.timeout.compile = 600
    prj.timeout.test = 600
    prj.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def nodes = new dockerNodes(nodeDetails, jobName, prj)

    boolean formatCheck = false

    // Assigned inside compileCommand; testCommand relies on that assignment.
    def commonGroovy

    def compileCommand = { platform, project->
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        // build with MPI enabled
        commonGroovy.runCompileCommand(platform, project, jobName, false, false, true)
        commonGroovy.runCompileClientCommand(platform, project, jobName, false)
    }

    def testCommand = { platform, project->
        //run single-process multi-GPU tests
        commonGroovy.runTestCommand(platform, project, false, "*multi_gpu*")
        // run MPI tests across 4 ranks
        commonGroovy.runTestCommand(platform, project, false, "*multi_gpu*", '--mp_lib mpi --mp_ranks 4 --mp_launch "/usr/local/openmpi/bin/mpirun --np 4 ./rocfft_mpi_worker"')
    }

    def packageCommand = { platform, project->
        // don't package anything - we're not distributing MPI-enabled rocFFT so we don't want to expose any MPI-enabled packages anywhere that other builds can mistakenly pick up
    }

    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}

// Pipeline entry point: matches the top-level job name taken from BUILD_URL
// against the property and node tables and runs runCI for it.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins properties; cron fields are minute, hour, day-of-month, month, day-of-week.
    def propertyList = ["main":[pipelineTriggers([cron('0 1 * * 0')])]]
    propertyList = auxiliary.appendPropertyList(propertyList)

    // Job name -> map of OS label to node labels.
    def jobNameList = ["main":([ubuntu20:['8gfx90a']])]
    jobNameList = auxiliary.appendJobNameList(jobNameList)

    propertyList.each { jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

    jobNameList.each { jobName, nodeDetails->
        if (urlJobName == jobName)
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if(!jobNameList.keySet().contains(urlJobName)) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu20:['8gfx90a']], urlJobName)
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/performance.groovy000066400000000000000000000214171501537341300211030ustar00rootroot00000000000000#!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _

// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.

import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path

// Builds two copies of rocFFT for performance comparison:
//   1. the checked-out source, with samples/tests/bench clients enabled, in build/<type>
//   2. a reference checkout ('develop' for PR-<n> branches, otherwise 'master')
//      in ref-repo/build/<type>, with the clients disabled.
//   debug       - selects a Debug build in the 'debug' directory instead of Release
//   buildStatic - NOTE(review): accepted but never read in this body
def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false) {
    def reference = (env.BRANCH_NAME ==~ /PR-\d+/) ? 'develop' : 'master'

    project.paths.construct_build_prefix()

    // Accumulate one dependency-install snippet per library when the project asks for CI-built dependencies.
    def getDependenciesCommand = ""
    if (project.installLibraryDependenciesFromCI) {
        project.libraryDependencies.each { libraryName ->
            getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
        }
    }

    // Check out the reference branch next to the build tree.
    dir("${project.paths.project_build_prefix}/ref-repo") {
        git branch: "${reference}", url: 'https://github.com/ROCmSoftwarePlatform/rocFFT.git'
    }

    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON'
    String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_BENCH=OFF'
    String warningArgs = '-DWERROR=ON'
    String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
    String buildTypeDir = debug ? 'debug' : 'release'
    String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
    // Labels containing 'centos' use the cmake3 executable name.
    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'

    // `set -e` is enabled only after the dependency step, so failures there do not abort the script.
    def command = """#!/usr/bin/env bash
                set -x
                cd ${project.paths.project_build_prefix}
                ${getDependenciesCommand}
                set -e
                mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
                ${auxiliary.gfxTargetParser()}
                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang -DAMDGPU_TARGETS=\$gfx_arch ${buildTypeArg} ${clientArgs} ${warningArgs} ${rtcBuildCache} ../..
                make -j\$(nproc)
                popd
                cd ref-repo
                mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
                ${auxiliary.gfxTargetParser()}
                ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang -DAMDGPU_TARGETS=\$gfx_arch ${buildTypeArg} ${noclientArgs} ${warningArgs} ${rtcBuildCache} ../..
                make -j\$(nproc)
            """
    platform.runCommand(this, command)
}

// Runs the benchmark suite for single and double precision against both the
// reference and the changed library, publishes the HTML reports, then installs
// rocPTS and records the collected datasets through record_pts.py.
def runTestCommand (platform, project, boolean debug=false) {
    // NOTE(review): `sudo` is assigned but not referenced again in this function.
    String sudo = auxiliary.sudo(platform.jenkinsLabel)
    String directory = debug ? 'debug' : 'release'

    def dataTypes = ['single', 'double']
    for (def dataType in dataTypes) {
        // The first --lib/--out pair is the reference build, the second is the changed build.
        def command = """#!/usr/bin/env bash
                    set -ex
                    pwd
                    cd ${project.paths.project_build_prefix}
                    export ROCFFT_RTC_CACHE_PATH="\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
                    ./scripts/perf/rocfft-perf run --bench ./build/${directory}/clients/staging/dyna-rocfft-bench --lib ./ref-repo/build/${directory}/library/src/librocfft.so --lib ./build/${directory}/library/src/librocfft.so --out ./${dataType}_ref --out ./${dataType}_change --device 0 --precision ${dataType} --suite benchmarks
                    ls ${dataType}_change
                    ls ${dataType}_ref
                    mkdir ${dataType}_results
                    ./scripts/perf/rocfft-perf post ./${dataType}_results ./${dataType}_ref ./${dataType}_change
                    ls ${dataType}_change/*.mdat
                    ./scripts/perf/rocfft-perf html ./${dataType}_results ./${dataType}_ref ./${dataType}_change
                    mv ${dataType}_results/figs.html ${dataType}_results/figs_${platform.gpu}.html
                """
        platform.runCommand(this, command)

        archiveArtifacts "${project.paths.project_build_prefix}/${dataType}_results/*.html"
        // The report name built here is the same string runCI uses when it links the report in the PR comment.
        publishHTML([allowMissing: false, alwaysLinkToLastBuild: false, keepAll: false, reportDir: "${project.paths.project_build_prefix}/${dataType}_results", reportFiles: "figs_${platform.gpu}.html", reportName: "${dataType}-precision-${platform.gpu}", reportTitles: "${dataType}-precision-${platform.gpu}"])
    }

    // Clone and install rocPTS using the bot's git credentials.
    // NOTE(review): this uses paths.build_prefix while the rest of the function
    // uses paths.project_build_prefix; confirm that is intentional.
    withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')]) {
        platform.runCommand(
            this,
            """
            cd ${project.paths.build_prefix}
            git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.2.0
            cd rocPTS
            python3 -m pip install build
            python3 -m build
            python3 -m pip install .
            """
        )
    }

    // Materialise the recording script shipped as a shared-library resource.
    writeFile(
        file: project.paths.project_build_prefix + "/record_pts.py",
        text: libraryResource("com/amd/scripts/record_pts.py"))

    // When CHANGE_ID is set, create the local branch first so the checkout below can succeed.
    def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : ""
    // Merge the per-precision output directories into one dated dataset folder, then record it.
    def command = """#!/usr/bin/env bash
                set -ex
                cd ${project.paths.project_build_prefix}
                ${setupBranch}
                git checkout \$BRANCH_NAME
                benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d)
                mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref
                cp -uf ./*_change/* \${benchmark_folder}/all_change
                cp -uf ./*_ref/* \${benchmark_folder}/all_ref
                python3 ./record_pts.py \
                    --dataset-path \$PWD/\${benchmark_folder} \
                    --reference-dataset all_ref \
                    --new-dataset all_change \
                    --new-build . \
                    --reference-build ./ref-repo\
                    -v 5.5 \
                    -l pts_rocfft_benchmark_data-v1.0.0
            """
    withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')]) {
        platform.runCommand(this, command)
    }
}

// Builds and benchmarks the 'Performance' rocFFT project, then, on PR-<n>
// branches, creates or updates a pull-request comment linking the reports.
def runCI = { nodeDetails, jobName->
    def prj = new rocProject('rocFFT-internal', 'Performance')

    prj.defaults.ccache = true
    prj.timeout.compile = 600
    prj.timeout.test = 600
    prj.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def nodes = new dockerNodes(nodeDetails, jobName, prj)

    boolean formatCheck = false

    // NOTE(review): commonGroovy is loaded in compileCommand but none of its methods are called in this closure.
    def commonGroovy
    // Filled by compileCommand with each platform's gpu; used to build report links below.
    def gpus = []
    def dataTypes = ['single', 'double']

    def compileCommand = { platform, project->
        gpus.add(platform.gpu)
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        runCompileCommand(platform, project, jobName)
    }

    def testCommand = { platform, project->
        runTestCommand(platform, project)
    }

    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null)

    def commentString = "Performance reports: \n" + "Commit hashes: \n"
    for (parentHash in prj.gitParentHashes) {
        commentString += "${parentHash} \n"
    }
    for (gpu in gpus) {
        for (dataType in dataTypes) {
            commentString += "[${gpu} ${dataType} report](${JOB_URL}/${dataType}-precision-${gpu})\n"
        }
    }

    if (env.BRANCH_NAME ==~ /PR-\d+/) {
        // Overwrite an existing report comment if one is found; otherwise post a new one.
        boolean commentExists = false
        for (prComment in pullRequest.comments) {
            if (prComment.body.contains("Performance reports:")) {
                commentExists = true
                prComment.body = commentString
            }
        }
        if (!commentExists) {
            // NOTE(review): the returned comment object is not used afterwards.
            def comment = pullRequest.comment(commentString)
        }
    }
}

// Pipeline entry point: matches the top-level job name taken from BUILD_URL
// against the property and node tables and runs runCI for it.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins properties; cron fields are minute, hour, day-of-month, month, day-of-week.
    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
    propertyList = auxiliary.appendPropertyList(propertyList)

    // Job name -> map of OS label to GPU targets.
    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900','gfx906']])]
    jobNameList = auxiliary.appendJobNameList(jobNameList)

    propertyList.each { jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

    jobNameList.each { jobName, nodeDetails->
        if (urlJobName == jobName)
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if(!jobNameList.keySet().contains(urlJobName)) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu18:['gfx906']], urlJobName)
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/staticanalysis.groovy000066400000000000000000000072361501537341300216400ustar00rootroot00000000000000#!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _

// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.

import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path

// Despite its name, this step compiles nothing: it runs yapf (pep8 style) over
// every *.py file outside build/ and diffs the formatted output against the file.
// The script uses `set -x` without `-e`, and `debug` is not read in this body.
def runCompileCommand(platform, project, jobName, boolean debug=false) {
    project.paths.construct_build_prefix()

    def yapfCommand = """#!/usr/bin/env bash
    set -x
    cd ${project.paths.project_build_prefix}
    yapf --version
    find . -iname '*.py' \
    | grep -v 'build/' \
    | xargs -n 1 -P 1 -I{} -t sh -c 'yapf --style pep8 {} | diff - {}'
    """

    platform.runCommand(this, yapfCommand)
}

// Runs two projects in sequence: 'StaticAnalysis' (format check, static
// analysis flag and the yapf step above) and 'BuildKernelSubset' (several
// kernel-subset builds through common.groovy).
def runCI = { nodeDetails, jobName->
    def prj = new rocProject('rocFFT-internal', 'StaticAnalysis')

    prj.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def nodes = new dockerNodes(nodeDetails, jobName, prj)

    boolean formatCheck = true
    boolean staticAnalysis = true

    def compileCommand = { platform, project->
        runCompileCommand(platform, project, jobName, false)
    }

    buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)

    def kernelSubsetPrj = new rocProject('rocFFT-internal', 'BuildKernelSubset')
    def nodesForPrj2 = new dockerNodes(nodeDetails, jobName, kernelSubsetPrj)

    def commonGroovy

    def compileSubsetCommand = { platform, project->
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"

        // build pattern pow2,pow7 no manual small and large, dp only
        commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'pow2,pow7', null, null, true)

        // build large sizes, dp only
        commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'large', null, null, true)

        // build 2D sizes, dp only
        commonGroovy.runSubsetBuildCommand(platform, project, jobName, '2D', null, null, true)

        // put an extra unsupported size(10) in manual large to see if it will be filtered correctly
        commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', null, '10,50,100,200,336', true)

        // put an extra unsupported size(23) in manual small to see if it will be filtered correctly
        commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23,1024', '10,50,100,200,336', true)

        // all the manual sizes are not supported
        //commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23', '10', true)
    }

    // formatCheck is still true here, so it is passed for the second project as well.
    buildProject(kernelSubsetPrj , formatCheck, nodesForPrj2.dockerArray, compileSubsetCommand, null, null)
}

// Pipeline entry point. Unlike the sibling pipelines there is no fallback
// branch: job names missing from jobNameList run nothing.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins properties; cron fields are minute, hour, day-of-month, month, day-of-week.
    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], "rocm-docker":[]]
    propertyList = auxiliary.appendPropertyList(propertyList)

    // The node list is empty here; any entries come from auxiliary.appendJobNameList.
    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]]
    jobNameList = auxiliary.appendJobNameList(jobNameList)

    propertyList.each { jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

    jobNameList.each { jobName, nodeDetails->
        if (urlJobName == jobName)
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
    }
}
rocFFT-rocm-6.4.3/.jenkins/staticlibrary.groovy000066400000000000000000000046541501537341300214620ustar00rootroot00000000000000#!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _

// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path

// Builds, tests and packages the 'StaticLibrary' rocFFT project on the nodes
// described by nodeDetails for the given jobName.
def runCI = { nodeDetails, jobName->
    def prj = new rocProject('rocFFT-internal', 'StaticLibrary')

    prj.defaults.ccache = true
    prj.timeout.compile = 600
    prj.timeout.test = 600
    prj.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def nodes = new dockerNodes(nodeDetails, jobName, prj)

    boolean formatCheck = false

    // Assigned inside compileCommand; the test and package closures rely on that assignment.
    def commonGroovy

    def compileCommand = { platform, project->
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        // presumably the trailing (false, true) are debug=false, buildStatic=true; confirm against common.groovy
        commonGroovy.runCompileCommand(platform, project, jobName, false, true)
    }

    def testCommand = { platform, project->
        commonGroovy.runTestCommand(platform, project)
    }

    def packageCommand = { platform, project->
        commonGroovy.runPackageCommand(platform, project, jobName)
    }

    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}

// Pipeline entry point: matches the top-level job name taken from BUILD_URL
// against the property and node tables and runs runCI for it.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins properties; cron fields are minute, hour, day-of-month, month, day-of-week.
    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
    propertyList = auxiliary.appendPropertyList(propertyList)

    // Job name -> map of OS label to GPU targets.
    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900']])]
    jobNameList = auxiliary.appendJobNameList(jobNameList)

    propertyList.each { jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

    jobNameList.each { jobName, nodeDetails->
        if (urlJobName == jobName)
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if(!jobNameList.keySet().contains(urlJobName)) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu16:['gfx906']], urlJobName)
        }
    }
}
rocFFT-rocm-6.4.3/.readthedocs.yaml000066400000000000000000000005721501537341300170420ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "mambaforge-22.9" conda: environment: docs/environment.yml rocFFT-rocm-6.4.3/CHANGELOG.md000066400000000000000000000472601501537341300154310ustar00rootroot00000000000000# Changelog for rocFFT Documentation for rocFFT is available at [https://rocm.docs.amd.com/projects/rocFFT/en/latest/](https://rocm.docs.amd.com/projects/rocFFT/en/latest/). ## rocFFT 1.0.32 for ROCm 6.4.0 ### Changed * Building with the address sanitizer option sets xnack+ on relevant GPU architectures and adds address-sanitizer support to runtime-compiled kernels. * The `AMDGPU_TARGETS` build variable should be replaced with `GPU_TARGETS`. `AMDGPU_TARGETS` is deprecated. ### Removed * Removed ahead-of-time compiled kernels for the gfx906, gfx940, and gfx941 architectures. These architectures still function the same, but kernels for them are now compiled at runtime. * Removed consumer GPU architectures from the precompiled kernel cache that ships with rocFFT. rocFFT continues to ship with a cache of precompiled RTC kernels for data-center and workstation architectures. As before, user-level caches can be enabled by setting the environment variable ROCFFT_RTC_CACHE_PATH to a writeable file location. ### Optimized * Improved MPI transform performance by using all-to-all communication for global transpose operations.
Point-to-point communications are still used when all-to-all is not possible. * Improved the performance of unit-strided, complex interleaved, forward and inverse, length (64,64,64) FFTs. ### Resolved issues * Fixed incorrect results from 2-kernel 3D FFT plans that used non-default output strides. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/507). * Plan descriptions can be reused with different strides for different plans. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/504). * Fixed client packages to depend on hipRAND instead of rocRAND. * Fixed potential integer overflows during large MPI transforms. ## rocFFT 1.0.31 for ROCm 6.3.0 ### Added * rocfft-test now includes a --smoketest option. * Support for the gfx1151, gfx1200, and gfx1201 architectures. * Implemented experimental APIs to allow computing FFTs on data distributed across multiple MPI ranks. These APIs can be enabled with the `ROCFFT_MPI_ENABLE` CMake option. This option defaults to `OFF`. When `ROCFFT_MPI_ENABLE` is set to `ON`: * `rocfft_plan_description_set_comm` can be called to provide an MPI communicator to a plan description, which can then be passed to `rocfft_plan_create`. Each rank calls `rocfft_field_add_brick` to specify the layout of data bricks on that rank. * An MPI library with ROCm acceleration enabled is required at build time and at runtime. ### Changed * Compilation uses amdclang++ instead of hipcc. * CLI11 replaces Boost Program Options as the command line parser for clients and samples. ## rocFFT 1.0.30 for ROCm 6.2.4 ### Optimizations * Implemented 1D kernels for factorizable sizes > 1024 and < 2048. ### Fixes * Fixed plan creation failure on some even-length real-complex transforms that use Bluestein's algorithm. 
### Additions * GFX1151 Support ## rocFFT 1.0.29 for ROCm 6.2.1 ### Optimizations * Implemented 1D kernels for factorizable sizes < 1024 ## rocFFT 1.0.28 for ROCm 6.2.0 ### Optimizations * Implemented multi-device transform for 3D pencil decomposition. Contiguous dimensions on input and output bricks are transformed locally, with global transposes to make remaining dimensions contiguous. ### Changes * Add option in dyna-bench to load the libs in forward and then reverse order for benchmark tests. * Randomly generated accuracy tests are now disabled by default; these can be enabled using the --nrand option (which defaults to 0). * Use Bonferroni multi-hypothesis testing framework by default for benchmark tests. ## rocFFT 1.0.27 for ROCm 6.1.1 ### Fixes * Fixed kernel launch failure on execute of very large odd-length real-complex transforms. ### Additions * Enable multi-gpu testing on systems without direct GPU-interconnects ## rocFFT 1.0.26 for ROCm 6.1.0 ### Changes * Multi-device FFTs now allow batch greater than 1 * Multi-device, real-complex FFTs are now supported * rocFFT now statically links libstdc++ when only `std::experimental::filesystem` is available (to guard against ABI incompatibilities with newer libstdc++ libraries that include `std::filesystem`) ## rocFFT 1.0.25 for ROCm 6.0.0 ### Additions * Implemented experimental APIs to allow computing FFTs on data distributed across multiple devices in a single process * `rocfft_field` is a new type that can be added to a plan description to describe the layout of FFT input or output * `rocfft_field_add_brick` can be called to describe the brick decomposition of an FFT field, where each brick can be assigned a different device These interfaces are still experimental and subject to change. We are interested in getting feedback. You can raise questions and concerns by opening issues in the [rocFFT issue tracker](https://github.com/ROCmSoftwarePlatform/rocFFT/issues). 
Note that multi-device FFTs currently have several limitations (we plan to address these in future releases): * Real-complex (forward or inverse) FFTs are not supported * Planar format fields are not supported * Batch (the `number_of_transforms` provided to `rocfft_plan_create`) must be 1 * FFT input is gathered to the current device at run time, so all FFT data must fit on that device ### Optimizations * Improved the performance of several 2D/3D real FFTs supported by `2D_SINGLE` kernel. Offline tuning provides more optimization for gfx90a * Removed an extra kernel launch from even-length, real-complex FFTs that use callbacks ### Changes * Built kernels in a solution map to the library kernel cache * Real forward transforms (real-to-complex) no longer overwrite input; rocFFT may still overwrite real inverse (complex-to-real) input, as this allows for faster performance * `rocfft-rider` and `dyna-rocfft-rider` have been renamed to `rocfft-bench` and `dyna-rocfft-bench`; these are controlled by the `BUILD_CLIENTS_BENCH` CMake option * Links for the former file names are installed, and the former `BUILD_CLIENTS_RIDER` CMake option is accepted for compatibility, but both will be removed in a future release * Binaries in debug builds no longer have a `-d` suffix ### Fixes * rocFFT now correctly handles load callbacks that convert data from a smaller data type (e.g., 16-bit integers -> 32-bit float) ## rocFFT 1.0.24 for ROCm 5.7.0 ### Optimizations * Improved the performance of complex forward/inverse 1D FFTs (2049 <= length <= 131071) that use Bluestein's algorithm ### Additions * Implemented a solution map version converter and finished the first conversion from ver.0 to ver.1 * Version 1 removes some incorrect kernels (sbrc/sbcr using `half_lds`) ### Changes * Moved `rocfft_rtc_helper` executable to the `lib/rocFFT` directory on Linux * Moved library kernel cache to the `lib/rocFFT` directory ## rocFFT 1.0.23 for ROCm 5.6.0 ### Additions * Implemented half-precision
transforms; these can be requested by passing `rocfft_precision_half` to `rocfft_plan_create` * Implemented a hierarchical solution map that saves information on how to decompose a problem and the kernels that are used * Implemented a first version of offline-tuner to support tuning kernels for C2C and Z2Z problems ### Changes * Replaced `std::complex` with hipComplex data types for the data generator * FFT plan dimensions are now sorted to be row-major internally where possible, which produces better plans if the dimensions were accidentally specified in a different order (column-major, for example) * Added the `--precision` argument to benchmark and test clients (`--double` is still accepted but is deprecated as a method to request a double-precision transform) * Improved performance test suite statistical framework ### Fixes * Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch failure ## rocFFT 1.0.22 for ROCm 5.5.0 ### Optimizations * Improved the performance of 1D lengths < 2048 that use Bluestein's algorithm * Reduced code generation time during plan creation * Optimized 3D R2C and C2R lengths 32, 84, 128 * Optimized batched small 1D R2C and C2R cases ### Additions * Added gfx1101 to default `AMDGPU_TARGETS` ### Changes * Moved client programs to C++17 * Moved planar kernels and infrequently used Stockham kernels to be runtime-compiled * Moved transpose, real-complex, Bluestein, and Stockham kernels to the library kernel cache ### Fixes * Removed zero-length twiddle table allocations, which fixes errors from `hipMallocManaged` * Fixed incorrect freeing of HIP stream handles during twiddle computation when multiple devices are present ## rocFFT 1.0.21 for ROCm 5.4.3 ### Fixes * Removed the source directory from `rocm_install_targets` to prevent the installation of `rocfft.h` in an unintended location ## rocFFT 1.0.20 for ROCm 5.4.1 ### Fixes * Fixed incorrect results on strided large 1D FFTs where batch size does 
not equal the stride ## rocFFT 1.0.19 for ROCm 5.4.0 ### Optimizations * Optimized some strided large 1D plans ### Additions * Added the `rocfft_plan_description_set_scale_factor` API to efficiently multiply each output element of an FFT by a given scaling factor * Created a `rocfft_kernel_cache.db` file next to the installed library; SBCC, CR, and RC kernels are moved to this file when built with the library, and are runtime-compiled for new GPU architectures * Added gfx1100 and gfx1102 to default `AMDGPU_TARGETS` ### Changes * Moved the runtime compilation cache to in-memory by default * A default on-disk cache can encounter contention problems on multi-node clusters with a shared filesystem * rocFFT can still use an on-disk cache by setting the `ROCFFT_RTC_CACHE_PATH` environment variable ## rocFFT 1.0.18 for ROCm 5.3.0 ### Changes * The runtime compilation cache now looks for environment variables `XDG_CACHE_HOME` (on Linux) and `LOCALAPPDATA` (on Windows) before falling back to `HOME` * Moved computation of the twiddle table from the host to the device ### Optimizations * Optimized 2D R2C and C2R to use 2-kernel plans where possible * Improved performance of the Bluestein algorithm * Optimized sbcc-168 and 100 by using half-LDS * Optimized length-280 2D and 3D transforms * Added kernels for factorizable 1D lengths < 128 ### Fixes * Fixed occasional failures to parallelize runtime compilation of kernels (failures would be retried serially and ultimately succeed, but this would take extra time) * Fixed failures of some R2C 3D transforms that use the unsupported `TILE_UNALIGNED` SBRC kernels (an example is 98^3 R2C out-of-place) * Fixed bugs in the `SBRC_ERC` type ## rocFFT 1.0.17 for ROCm 5.2.0 ### Additions * Packages for test and benchmark executables on all supported operating systems using CPack * Added file and folder reorganization changes, with backward compatibility support, using `rocm-cmake` wrapper functions ### Changes * Improved reuse of twiddle
memory between plans * Set a default load/store callback when only one callback type is set via the API (for improved performance) * Updated the GoogleTest dependency to version 1.11 ### Optimizations * Introduced a new access pattern of LDS (non-linear) and applied it on sbcc kernels len 64 and 81 for a performance improvement * Applied `lds-non-linear`, `direct-load-to-register`, and `direct-store-from-register` on sbcr kernels for a performance improvement ### Fixes * Correctness of certain transforms with unusual strides * Incorrect handling of user-specified stream for runtime-compiled kernels * Incorrect buffer allocation in `rocfft-test` on in-place transforms with different input and output sizes ## rocFFT 1.0.16 for ROCm 5.1.0 ### Changes * Supported unaligned tile dimension for `SBRC_2D` kernels * Improved test and benchmark infrastructure by adding RAII * Enabled runtime compilation of length-2304 FFT kernel during plan creation * Added tokenizer for test suite * Reduce twiddle memory requirements for even-length, real-complex transforms * Clients can now be built separately from the main library ### Optimizations * Optimized more large 1D cases by using `L1D_CC` plan * Optimized the 3D 200^3 C2R case * Optimized the 1D 2^30 double precision on MI200 * Added padding to work buffer sizes to improve performance in many cases ### Fixes * Fixed the correctness of some R2C transforms with unusual strides ### Removals * The hipFFT API (header) has been removed; use the [hipFFT](https://github.com/ROCmSoftwarePlatform/hipFFT) package or repository to obtain the API ## rocFFT 1.0.15 for ROCm 5.0.0 ### Changes * Enabled runtime compilation of single FFT kernels > length 1024 * Re-aligned the split device library into four roughly equal libraries * Implemented the FuseShim framework to replace the original OptimizePlan * Implemented the generic buffer-assignment framework * The buffer assignment is no longer performed by each node--we designed a generic algorithm 
to test and pick the best assignment path * With the help of FuseShim, we can achieve the most kernel-fusions possible * Don't read the imaginary part of the DC and Nyquist modes for even-length complex-to-real transforms ### Optimizations * Optimized twiddle conjugation; complex-to-complex inverse transforms should now have similar performance to forward transforms * Improved performance of single-kernel, small 2D transforms ## rocFFT 1.0.14 for ROCm 4.5.0 ### Optimizations * Optimized SBCC kernels of lengths 52, 60, 72, 80, 84, 96, 104, 108, 112, 160, 168, 208, 216, 224, and 240 with a new kernel generator ### Additions * Added support for Windows 10 as a build target ### Changes * Packaging has been split into a runtime package (`rocfft`) and a development package (`rocfft-devel`): The development package depends on the runtime package. When installing the runtime package, the package manager will suggest the installation of the development package to aid users transitioning from the previous version's combined package. This suggestion by package manager is for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion` feature in the runtime package is introduced as a deprecated feature and will be removed in a future ROCm release. 
### Fixes * Fixed validation failures for even-length R2C inplace 2D and 3D cubic sizes, such as 100^2 (or ^3), 200^2 (or ^3), and 256^2 (or ^3) * We combine two kernels (`r2c-transpose`) instead of combining the three kernels (`stockham-r2c-transpose`) ### Changes * Split 2D device code into separate libraries ## rocFFT 1.0.13 for ROCm 4.4.0 ### Optimizations * Improved plans by removing unnecessary transpose steps * Optimized scheme selection for 3D problems * Imposed fewer restrictions on `3D_BLOCK_RC` selection (more problems can use `3D_BLOCK_RC` and have performance gains) * Enabled `3D_RC`; some 3D problems with SBCC-supported z-dim can use fewer kernels to get benefits * Forced `--length` 336 336 56 (dp) to use faster `3D_RC` to prevent it from being skipped by a conservative threshold test * Optimized some even-length R2C/C2R cases by doing more in-place operations and combining pre- and post-processing into Stockham kernels * Added radix-17 ### Additions * Added a new kernel generator for select fused 2D transforms ### Fixes * Improved large 1D transform decompositions ## rocFFT 1.0.12 for ROCm 4.3.0 ### Changes * Re-split device code into single-precision, double-precision, and miscellaneous kernels ### Fixes * Fixed potential crashes in double-precision planar->planar transpose * Fixed potential crashes in 3D transforms with unusual strides for SBCC-optimized sizes * Improved buffer placement logic ### Additions * Added a new kernel generator for select lengths; new kernels have improved performance * Added public `rocfft_execution_info_set_load_callback` and `rocfft_execution_info_set_store_callback` API functions to allow running extra logic when loading data from and storing data to global memory during a transform ### Removals * Removed R2C pair schemes and kernels ### Optimizations * Optimized 2D and 3D R2C 100 and 1D Z2Z 2500 * Reduced number of kernels for 2D/3D sizes where higher dimension is 64, 128, 256 ### Fixes * Fixed potential crashes in
3D transforms with unusual strides, for SBCC-optimized sizes ## rocFFT 1.0.11 for ROCm 4.2.0 ### Changes * Move device code into the main library ### Optimizations * Improved performance for single-precision kernels exercising all except radix-2/7 butterfly ops * Minor optimization for C2R 3D 100 and 200 cube sizes * Optimized some C2C and R2C 3D 64, 81, 100, 128, 200, and 256 rectangular sizes * When factoring, test to see if the remaining length is explicitly supported * Explicitly added radix-7 lengths 14, 21, and 224 to list of supported lengths * Optimized R2C 2D and 3D 128, 200, and 256 cube sizes ### Fixes * Fixed potential crashes in small 3D transforms with unusual strides ([issue 311](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/311)) * Fixed potential crashes when running transforms on multiple devices ([issue 310](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/310)) ## rocFFT 1.0.10 for ROCm 4.1.0 ### Additions * Explicitly specify `MAX_THREADS_PER_BLOCK` through `__launch_bounds__` for all kernels * Switched to a new syntax for specifying AMD GPU architecture names and features ### Optimizations * Optimized C2C and R2C 3D 64, 81, 100, 128, 200, and 256 cube sizes * Improved the performance of the standalone out-of-place transpose kernel * Optimized the 1D length 40000 C2C case * Enabled radix-7 for size 336 * New radix-11 and radix-13 kernels; used in length 11 and 13 (and some of their multiples) transforms ### Changes * rocFFT now automatically allocates a work buffer if the plan requires one and none is provided * An explicit `rocfft_status_invalid_work_buffer` error is now returned when a work buffer of insufficient size is provided * Updated online documentation * Updated Debian package name version with separated underscore ( _ ) * Adjusted accuracy test tolerances and how they are compared ### Fixes * Fixed a 4x4x8192 accuracy failure ## rocFFT 1.0.8 for ROCm 3.10.0 ### Optimizations * Optimized the 1D length 10000 C2C case
### Changes * Added the `BUILD_CLIENTS_ALL` CMake option ### Fixes * Fixed the correctness of SBCC and SBRC kernels with non-unit strides * Fixed fused C2R kernel when a Bluestein transform follows it ## rocFFT 1.0.7 for ROCm 3.9.0 ### Optimizations * New R2C and C2R fused kernels to combine pre- and post-processing steps with transpose * Enabled diagonal transpose for 1D and 2D power-of-2 cases * New single kernels for small power-of-2, 3, and 5 sizes * Added more radix-7 kernels ### Changes * Explicitly disabled XNACK and SRAM-ECC features on AMDGPU hardware ### Fixes * Fixed 2D C2R transform with length 1 on one dimension * Fixed a potential thread unsafety in logging ## rocFFT 1.0.6 for ROCm 3.8.0 ### Optimizations * Improved the performance of 1D batch-paired R2C transforms of odd length * Added some radix-7 kernels * Improved the performance for 1D length 6561 and 10000 * Improved the performance for certain 2D transform sizes ### Changes * Allowed a static library build with `BUILD_SHARED_LIBS=OFF` CMake option * Updated GoogleTest dependency to version 1.10 ### Fixes * Correctness of certain large 2D sizes ## rocFFT 1.0.5 for ROCm 3.7.0 ### Optimizations * Optimized C2C power-of-2 middle sizes ### Changes * Parallelized work in unit tests and eliminated duplicate cases ### Fixes * Correctness of certain large 1D, and 2D power-of-3 and 5 sizes * Incorrect buffer assignment for some even-length R2C transforms * `<complex>` inclusion on C compilers * Incorrect results on non-unit strides with SBCC/SBRC kernels rocFFT-rocm-6.4.3/CMakeLists.txt000066400000000000000000000261071501537341300163550ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# Top-level build script for rocFFT: configures the library, the optional
# clients (bench, samples, tests) and the packaging metadata.

cmake_minimum_required( VERSION 3.16 )

# We use C++17 features, this will add compile option: -std=c++17
set( CMAKE_CXX_STANDARD 17 )
# Refuse to silently fall back to an older standard on compilers lacking C++17.
set( CMAKE_CXX_STANDARD_REQUIRED ON )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  # PROJECT_BINARY_DIR is only set by project(), which has not run yet at this
  # point; CMAKE_BINARY_DIR is already defined and gives the intended
  # "<build>/package" default instead of "/package".
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Marker read by clients/CMakeLists.txt and clients/bench/CMakeLists.txt to
# detect that they are being built as part of the full rocFFT tree.
set( ROCFFT_BUILD_SCOPE ON )

project( rocfft LANGUAGES CXX C )

# This finds the rocm-cmake project, and installs it if not found
# rocm-cmake contains common cmake code for rocm projects to help setup and install
set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
find_package( ROCM 0.7.3 CONFIG QUIET PATHS ${ROCM_PATH} /opt/rocm )
if( NOT ROCM_FOUND )
  set( rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download" )
  file( DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip
      ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log)

  # STATUS is a two-element list: numeric code followed by a message string.
  list(GET status 0 status_code)
  list(GET status 1 status_string)

  if(NOT status_code EQUAL 0)
    message(FATAL_ERROR "error: downloading 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed status_code: ${status_code} status_string: ${status_string} log: ${log} ")
  endif()

  message(STATUS "downloading... done")

  # Unpack, configure and install rocm-cmake into the build tree, then
  # require that the freshly installed copy can be found.
  execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzvf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR} )
  execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} )
  execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})

  find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
endif( )

include( ROCMSetupVersion )
include( ROCMCreatePackage )
include( ROCMInstallTargets )
include( ROCMPackageConfigHelpers )
include( ROCMInstallSymlinks )
include( ROCMCheckTargetIds )
include( ROCMClients )
include( ROCMHeaderWrapper )

# Using standardized versioning from rocm-cmake
set ( VERSION_STRING "1.0.32" )
rocm_setup_version( VERSION ${VERSION_STRING} )

# Append our library helper cmake path and the cmake path for hip (for
# convenience).
# Users may override HIP path by specifying their own in CMAKE_MODULE_PATH
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Enable verbose output
option( BUILD_VERBOSE "Output additional build information" OFF )

# BUILD_SHARED_LIBS is a cmake built-in; we make it an explicit option
# such that it shows in cmake-gui
option( BUILD_SHARED_LIBS "Build rocFFT as a shared library" ON )

option( WERROR "Treat warnings as errors" OFF )

option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)

option(ROCFFT_RUNTIME_COMPILE_DEFAULT "Compile kernels at runtime by default" OFF)

# Using -DROCFFT_BUILD_OFFLINE_TUNER=ON to compile an executable,
# Set default to OFF since users are not likely to tune
option(ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_offline_tuner" OFF)

# Provide ability to disable hipRAND dependency
option(USE_HIPRAND "Use hipRAND to provide device-side input generation" ON)

if( USE_HIPRAND )
  add_compile_definitions(USE_HIPRAND)
endif( )

# Split up function pool compilation across N files to parallelize its build
set(ROCFFT_FUNCTION_POOL_N 8 CACHE STRING "Number of files to split function_pool into for compilation")

# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG
option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
  # CMAKE_CURRENT_SOURCE_DIR (this directory) rather than CMAKE_SOURCE_DIR
  # (top of the whole build) so the path stays correct when rocFFT is added
  # as a subdirectory of another project.
  rocm_wrap_header_dir(
    ${CMAKE_CURRENT_SOURCE_DIR}/library/include
    PATTERNS "*.h"
    GUARDS SYMLINK WRAPPER
    WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR}
  )
endif()

set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wsign-compare )
if( WERROR )
  set( WARNING_FLAGS ${WARNING_FLAGS} -Werror )
endif( )

set(DEFAULT_GPUS
  gfx803
  gfx900
  gfx906
  gfx908
  gfx90a
  gfx940
  gfx941
  gfx942
  gfx1030
  gfx1100
  gfx1101
  gfx1102
  gfx1151
  gfx1200
  gfx1201)

if(BUILD_ADDRESS_SANITIZER)
  add_compile_options(-fsanitize=address)
  add_link_options(-fsanitize=address)
  add_link_options(-shared-libasan)
  # The sanitizer build replaces the default target list with xnack+ variants.
  set(DEFAULT_GPUS
    gfx908:xnack+
    gfx90a:xnack+
    gfx940:xnack+
    gfx941:xnack+
    gfx942:xnack+)
  add_link_options(-fuse-ld=lld)
  set(ROCFFT_KERNEL_CACHE_ENABLE off)
  add_compile_definitions(ADDRESS_SANITIZER)
endif()

# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
  message(STATUS "Building only for local GPU target")
  if (COMMAND rocm_local_targets)
    rocm_local_targets(DEFAULT_GPUS)
  else()
    message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
  endif()
endif()

if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
  message( DEPRECATION "AMDGPU_TARGETS use is deprecated. Use GPU_TARGETS." )
endif()
set(AMDGPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined. (Deprecated, prefer GPU_TARGETS)")
rocm_check_target_ids(AMDGPU_TARGETS TARGETS "${AMDGPU_TARGETS}")
# Don't force, users should be able to override GPU_TARGETS at the command line if desired
set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for")

# HIP is required - library and clients use HIP to access the device
find_package( HIP REQUIRED CONFIG )

# The nvidia backend can be used to compile for CUDA devices.
# Specify the CUDA prefix in the CUDA_PREFIX variable.
# CUDA_ARCH (e.g. sm_75) is also required.
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required (e.g. /usr/local/cuda-11.4)" )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required. (e.g. sm_75)" )
  endif()
  add_compile_options(-I${HIP_ROOT_DIR}/include -I${CUDA_PREFIX}/include -D__HIP_PLATFORM_NVIDIA__)
  add_link_options(-L${CUDA_PREFIX}/lib64 -pthread)
endif( )

# hipcc automatically provides HIP include dirs and HIP platform,
# but plain clang needs to be told
if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
  include_directories( ${HIP_INCLUDE_DIRS} )
  if( USE_CUDA )
    add_compile_definitions( __HIP_PLATFORM_NVIDIA__ )
  else()
    add_compile_definitions( __HIP_PLATFORM_AMD__ )
  endif()
endif()

# Enable MPI support in rocFFT:
option(ROCFFT_MPI_ENABLE "Enable MPI" OFF)
option(ROCFFT_CRAY_MPI_ENABLE "Cray MPI" OFF)

if( ROCFFT_MPI_ENABLE )
  find_package( MPI REQUIRED )
  include_directories(SYSTEM ${MPI_INCLUDE_PATH})
endif()

add_subdirectory( library )

include( clients/cmake/build-options.cmake )

# Build clients of the library
if( BUILD_CLIENTS )
  set( BUILD_CLIENTS_BENCH ON )
  set( BUILD_CLIENTS_SAMPLES ON )
  set( BUILD_CLIENTS_TESTS ON )
endif( )

# old name for BUILD_CLIENTS_BENCH
if( BUILD_CLIENTS_RIDER )
  set( BUILD_CLIENTS_BENCH ${BUILD_CLIENTS_RIDER} )
endif()

if( BUILD_CLIENTS_SAMPLES
    OR BUILD_CLIENTS_TESTS
    OR BUILD_CLIENTS_BENCH )
  if( NOT CLIENTS_OS )
    rocm_set_os_id( CLIENTS_OS )
  endif()

  # Package names for the OpenMP and FFTW runtime dependencies of the tests;
  # they are only set when OpenMP use in tests is not explicitly disabled.
  if(BUILD_CLIENTS_TESTS AND (NOT DEFINED BUILD_CLIENTS_TESTS_OPENMP OR BUILD_CLIENTS_TESTS_OPENMP))
    set(OPENMP_DEB "libgomp1")
    set(FFTW_DEB "libfftw3-bin")
    if(CLIENTS_OS STREQUAL "sles")
      set(OPENMP_RPM "libgomp1")
      set(FFTW_RPM "libfftw3-3")
    else()
      set(OPENMP_RPM "libgomp")
      set(FFTW_RPM "fftw-libs")
    endif()
  endif()

  rocm_package_setup_component(clients)
  if( USE_HIPRAND )
    set( HIPRAND_DEP hiprand )
  endif()
  if(BUILD_CLIENTS_TESTS)
    rocm_package_setup_client_component(
      tests
      DEPENDS
        DEB ${OPENMP_DEB} ${FFTW_DEB} ${HIPRAND_DEP}
        RPM ${OPENMP_RPM} ${FFTW_RPM} ${HIPRAND_DEP}
    )
  endif()
  if(BUILD_CLIENTS_BENCH)
    rocm_package_setup_client_component(
      benchmarks
      DEPENDS
        DEB ${HIPRAND_DEP}
        RPM ${HIPRAND_DEP}
    )
  endif()
  add_subdirectory( clients )
endif( )

if(WIN32)
  set(CPACK_SOURCE_GENERATOR "ZIP")
  set(CPACK_GENERATOR "ZIP")
  # NOTE(review): FORCE overrides any install prefix the user passed on the
  # command line; kept as-is because packaging may rely on it.
  set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE)
  set(INSTALL_PREFIX "C:/hipSDK")
  set(CPACK_SET_DESTDIR OFF)
  set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK")
  set(CPACK_PACKAGING_INSTALL_PREFIX "")
  set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
endif()

# Package specific CPACK vars
string( TOLOWER "${HIP_RUNTIME}" HIP_RUNTIME_LOWER )
if( HIP_RUNTIME_LOWER STREQUAL "rocclr" )
  if(BUILD_ADDRESS_SANITIZER)
    set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
  else()
    set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
  endif()
  rocm_package_add_dependencies("${DEPENDS_HIP_RUNTIME} >= 4.5.0")
endif( )

set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" )
set( CPACK_RPM_PACKAGE_LICENSE "MIT" )

set( CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" )

set( ROCFFT_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file" )

set( package_name rocfft )

rocm_create_package(
  NAME ${package_name}
  DESCRIPTION "ROCm FFT library"
  MAINTAINER "rocfft-maintainer@amd.com"
  LDCONFIG
  LDCONFIG_DIR ${ROCFFT_CONFIG_DIR}
)
rocFFT-rocm-6.4.3/CppCheckSuppressions.txt000066400000000000000000000003421501537341300204650ustar00rootroot00000000000000// generator uses implicit constructors for convenience noExplicitConstructor:library/src/device/generator/generator.h // has some false positives and isn't hard to run manually for periodic // dead code sweeps unusedFunction rocFFT-rocm-6.4.3/LICENSE.md000066400000000000000000000054121501537341300152150ustar00rootroot00000000000000# License Copyright (C) 2016 - 2025
Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. This product includes software from copyright holders as shown below, and distributed under their license terms as specified. CLI11 2.2 Copyright (c) 2017-2024 University of Cincinnati, developed by Henry Schreiner under NSF AWARD 1414736. All rights reserved. Redistribution and use in source and binary forms of CLI11, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. rocFFT-rocm-6.4.3/README.md000066400000000000000000000075201501537341300150720ustar00rootroot00000000000000# rocFFT rocFFT is a software library for computing fast Fourier transforms (FFTs) written in the HIP programming language. It's part of AMD's software ecosystem based on [ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD and NVIDIA GPUs. ## Documentation > [!NOTE] > The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the rocFFT/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). 
To build our documentation locally, use the following code: ```Bash cd docs pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` ## Build and install You can install rocFFT using pre-built packages or building from source. * Installing pre-built packages: 1. Download the pre-built packages from the [ROCm package servers](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) or use the GitHub releases tab to download the source (this may give you a more recent version than the pre-built packages). 2. Run: `sudo apt update && sudo apt install rocfft` * Building from source: rocFFT is compiled with AMD's clang++ and uses CMake. You can specify several options to customize your build. The following commands build a shared library for supported AMD GPUs: ```bash mkdir build && cd build cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_C_COMPILER=amdclang .. make -j ``` You can compile a static library using the `-DBUILD_SHARED_LIBS=off` option. With rocFFT, you can use indirect function calls by default; this requires ROCm 4.3 or higher. You can use `-DROCFFT_CALLBACKS_ENABLED=off` with CMake to prevent these calls on older ROCm compilers. Note that with this configuration, callbacks won't work correctly. rocFFT includes the following clients: * `rocfft-bench`: Runs general transforms and is useful for performance analysis * `rocfft-test`: Runs various regression tests * Various small samples | Client | CMake option | Dependencies | |:------|:-----------------|:-----------------| | `rocfft-bench` | `-DBUILD_CLIENTS_BENCH=on` | hipRAND | | `rocfft-test` | `-DBUILD_CLIENTS_TESTS=on` | hipRAND, FFTW, GoogleTest | | samples | `-DBUILD_CLIENTS_SAMPLES=on` | None | Clients are not built by default. To build them, use `-DBUILD_CLIENTS=on`. The build process downloads and builds GoogleTest and FFTW if they are not already installed. Clients can be built separately from the main library. 
For example, you can build all the clients with an existing rocFFT library by invoking CMake from within the `rocFFT-src/clients` folder: ```bash mkdir build && cd build cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_PREFIX_PATH=/path/to/rocFFT-lib .. make -j ``` To install client dependencies on Ubuntu, run: ```bash sudo apt install libgtest-dev libfftw3-dev libboost-dev ``` We use version 1.11 of GoogleTest. ## Examples A summary of the latest functionality and workflow to compute an FFT with rocFFT is available on the [rocFFT documentation portal](https://rocm.docs.amd.com/projects/rocFFT/en/latest/). You can find additional examples in the `clients/samples` subdirectory. ## Support You can report bugs and feature requests through the GitHub [issue tracker](https://github.com/ROCm/rocFFT/issues). ## Contribute If you want to contribute to rocFFT, you must follow our [contribution guidelines](https://github.com/ROCm/rocFFT/blob/develop/.github/CONTRIBUTING.md). rocFFT-rocm-6.4.3/ValgrindSuppressions.txt000066400000000000000000000002671501537341300205610ustar00rootroot00000000000000{ Memcheck:Param sched_setaffinity(mask) ... fun:hipMalloc } { Memcheck:Param sched_setaffinity(mask) ... fun:hipMemGetInfo }rocFFT-rocm-6.4.3/clients/000077500000000000000000000000001501537341300152505ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/CMakeLists.txt000066400000000000000000000106431501537341300200140ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# Build script for the rocFFT clients (samples, tests, bench).  It is used
# both as a subdirectory of the main rocFFT build and as a standalone
# top-level project.

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  # PROJECT_BINARY_DIR is only set by project().  In a standalone clients
  # build it is still empty here, which would yield "/package"; use
  # CMAKE_BINARY_DIR, which is always defined.
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Marker read by clients/bench/CMakeLists.txt to pick its output directory.
set( ROCFFT_CLIENTS_BUILD_SCOPE ON )

# This project may compile dependencies for clients
project( rocfft-clients LANGUAGES CXX C )

set(CMAKE_CXX_STANDARD 17)

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

if( NOT ROCM_FOUND )
  find_package( ROCM 0.7.3 REQUIRED )
endif()

include( ROCMInstallTargets )

# Adding Version File to rocfft-client, this avoids empty rocfft-client package
file ( WRITE "${PROJECT_BINARY_DIR}/package/client-version" "${rocfft_VERSION_MAJOR}.${rocfft_VERSION_MINOR}.${rocfft_VERSION_PATCH}-${BUILD_ID}\n" )
rocm_install ( FILES ${PROJECT_BINARY_DIR}/package/client-version DESTINATION .info COMPONENT clients)

# This option only works for make/nmake and the ninja generators, but
# no reason it shouldn't be on all the time.
# This tells cmake to create a compile_commands.json file that can be
# used with clang tooling or vim.
set( CMAKE_EXPORT_COMPILE_COMMANDS ON )

# Outside the full rocFFT build, build every client when none was requested
# explicitly.
if(NOT ROCFFT_BUILD_SCOPE AND
    NOT BUILD_CLIENTS_SAMPLES AND
    NOT BUILD_CLIENTS_TESTS AND
    NOT BUILD_CLIENTS_BENCH)
  set( BUILD_CLIENTS_SAMPLES ON )
  set( BUILD_CLIENTS_TESTS ON )
  set( BUILD_CLIENTS_BENCH ON )
endif()

# each backend requires different libraries for host and device code
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required." )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required." )
  endif()
  add_compile_options(-I${HIP_ROOT_DIR}/include -I${CUDA_PREFIX}/include -D__HIP_PLATFORM_NVIDIA__)
  add_link_options(-L${CUDA_PREFIX}/lib64 -pthread)

  add_compile_options(--cuda-path=${CUDA_PREFIX} --cuda-gpu-arch=${CUDA_ARCH} -xcuda)
  set( ROCFFT_CLIENTS_HOST_LINK_LIBS -lcudart -ldl -lrt )
else()
  set( ROCFFT_CLIENTS_HOST_LINK_LIBS hip::host )
  set( ROCFFT_CLIENTS_DEVICE_LINK_LIBS hip::device )
endif()

if( ROCFFT_MPI_ENABLE )
  find_package( MPI REQUIRED )
endif()

if( BUILD_CLIENTS_SAMPLES )
  add_subdirectory( samples )
endif( )

if( BUILD_CLIENTS_TESTS )
  add_subdirectory( tests )
endif( )

if( BUILD_CLIENTS_BENCH )
  add_subdirectory( bench )
endif( )
rocFFT-rocm-6.4.3/clients/bench/000077500000000000000000000000001501537341300163275ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/bench/CMakeLists.txt000066400000000000000000000136261501537341300210770ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# Build script for the benchmark clients rocfft-bench and dyna-rocfft-bench.
# It also creates and installs a link under the old "rider" name for each one.

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  # PROJECT_BINARY_DIR is only set by project(); when this directory is
  # configured on its own it is still empty here.  CMAKE_BINARY_DIR is
  # always defined.
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-bench LANGUAGES CXX )

set(CMAKE_CXX_STANDARD 17)

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Each dependency is only searched for when an enclosing build has not
# already provided it.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()

if( NOT ROCM_FOUND )
  find_package( ROCM 0.7.3 REQUIRED )
endif()

if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

include( ROCMInstallTargets )

set( bench_list rocfft-bench dyna-rocfft-bench )

foreach( bench ${bench_list})

  # Quoted so the comparison is always string-vs-string and never
  # re-dereferences the target name as a variable.
  if( "${bench}" STREQUAL "rocfft-bench" )
    add_executable( ${bench} ../../shared/array_validator.cpp bench.cpp bench.h )
  else()
    add_executable( ${bench} ../../shared/array_validator.cpp dyna-bench.cpp bench.h )
  endif()

  target_compile_options( ${bench} PRIVATE ${WARNING_FLAGS} -Wno-cpp )

  # NB: hip-clang includes omp.h, so we need to specify the location
  # of ROCM_CLANG_ROOT at cmake config time if we are using clang++.
  # NOTE(review): the lone `$` below looks like the remnant of a generator
  # expression that was lost from this copy of the file - restore it from
  # upstream before relying on this include path.
  target_include_directories( ${bench}
    PRIVATE
    $
    ${HIP_CLANG_ROOT}/include
    ${ROCM_CLANG_ROOT}/include
  )

  # rocfft-bench links the library directly; the other bench links
  # CMAKE_DL_LIBS and not roc::rocfft.
  if( "${bench}" STREQUAL "rocfft-bench" )
    target_link_libraries( ${bench}
      PRIVATE
      hip::device
      roc::rocfft
    )
  else()
    target_link_libraries( ${bench}
      PRIVATE
      ${CMAKE_DL_LIBS}
      hip::device
    )
  endif()

  if( USE_HIPRAND )
    target_link_libraries( ${bench}
      PRIVATE
      hip::hiprand
    )
  endif()

  # We need to include both rocfft.h and rocfft-export.h
  target_include_directories( ${bench}
    PRIVATE
    ${CMAKE_BINARY_DIR}/include
    ${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/
    ${HIP_CLANG_ROOT}/include
  )

  target_link_libraries( ${bench} PUBLIC ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )

  if( ROCFFT_MPI_ENABLE )
    target_link_libraries( ${bench}
      PRIVATE
      MPI::MPI_CXX
    )
    if ( ROCFFT_CRAY_MPI_ENABLE)
      target_link_libraries( ${bench}
        PRIVATE
        "mpi_gtl_hsa"
      )
      get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
      target_link_directories( ${bench}
        PRIVATE
        ${MPI_LIBDIR}/../../../../gtl/lib
      )
    endif()
  endif()

  set_target_properties( ${bench} PROPERTIES
    CXX_STANDARD_REQUIRED ON
  )

  # Output directory depends on which enclosing build (if any) included us.
  if( ROCFFT_BUILD_SCOPE )
    set( BENCH_OUT_DIR "/../staging" )
  elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
    set( BENCH_OUT_DIR "/../bin" )
  else()
    set( BENCH_OUT_DIR "/bin")
  endif()
  string( CONCAT BENCH_OUT_DIR "${PROJECT_BINARY_DIR}" ${BENCH_OUT_DIR} )

  set_target_properties(${bench}
    PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY
    ${BENCH_OUT_DIR}
  )

  rocm_install(TARGETS ${bench} COMPONENT benchmarks)

  # install compatibility for old name of bench program - symlink on
  # unix, hardlink on windows (since privilege is required to create
  # symlinks there)
  string(REPLACE bench rider bench_legacy ${bench})
  # NOTE(review): both BENCH_NEW_NAME values below contain a bare `$` where a
  # generator expression naming the built executable appears to have been
  # lost from this copy - restore from upstream.
  if( WIN32 )
    set( BENCH_LINK_COMMAND create_hardlink )
    set( BENCH_NEW_NAME ${BENCH_OUT_DIR}/$${CMAKE_EXECUTABLE_SUFFIX} )
    set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy}${CMAKE_EXECUTABLE_SUFFIX} )
  else()
    set( BENCH_LINK_COMMAND create_symlink )
    set( BENCH_NEW_NAME $ )
    set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy} )
  endif()
  add_custom_command(
    TARGET ${bench}
    POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E ${BENCH_LINK_COMMAND} ${BENCH_NEW_NAME} ${BENCH_OLD_NAME}
    VERBATIM
  )
  install(
    FILES ${BENCH_OLD_NAME}
    DESTINATION ${CMAKE_INSTALL_BINDIR}
    COMPONENT benchmarks
  )
endforeach()

# Link dyna-rocfft-bench to the experimental filesystem library if
# it's not available in the standard library.
include( ../../cmake/std-filesystem.cmake )
target_link_std_experimental_filesystem( dyna-rocfft-bench )
rocFFT-rocm-6.4.3/clients/bench/bench.cpp000066400000000000000000000456711501537341300201270ustar00rootroot00000000000000// Copyright (C) 2016 - 2024 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "../../shared/CLI11.hpp" #include "../../shared/arithmetic.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "../../shared/rocfft_params.h" #include "bench.h" #include "rocfft/rocfft.h" int main(int argc, char* argv[]) { // This helps with mixing output of both wide and narrow characters to the screen std::ios::sync_with_stdio(false); // Control output verbosity: int verbose{}; // number of GPUs to use: int ngpus{}; // hip Device number for running tests: int deviceId{}; // Ignore runtime failures. // eg: hipMalloc failing when there isn't enough free vram. bool ignore_hip_runtime_failures{true}; // Number of performance trial samples int ntrial{}; // FFT parameters: rocfft_params params; // input/output FFT grids std::vector ingrid; std::vector outgrid; // Token string to fully specify fft params. 
std::string token; CLI::App app{"rocfft-bench command line options"}; // Declare the supported options. Some option pointers are declared to track passed opts. app.add_flag("--version", "Print queryable version information from the rocfft library") ->each([](const std::string&) { char v[256]; rocfft_get_version_string(v, 256); std::cout << "version " << v << std::endl; return EXIT_SUCCESS; }); CLI::Option* opt_token = app.add_option("--token", token, "Token to read FFT params from")->default_val(""); // Group together options that conflict with --token auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { params.precision = fft_precision_double; }); non_token->excludes(opt_token); non_token ->add_option("-t, --transformType", params.transform_type, "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); non_token ->add_option( "--precision", params.precision, "Transform precision: single (default), double, half") ->excludes("--double"); CLI::Option* opt_not_in_place = non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)") ->each([&](const std::string&) { params.placement = fft_placement_notinplace; }); non_token ->add_option("--itype", params.itype, "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token ->add_option("--otype", params.otype, "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); CLI::Option* opt_length = non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3); non_token->add_option("--ngpus", ngpus, "Number of GPUs to use") 
->default_val(1) ->check(CLI::NonNegativeNumber); // define multi-GPU grids for FFT computation, CLI::Option* opt_ingrid = non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input") ->expected(1, 3) ->needs("--ngpus"); CLI::Option* opt_outgrid = non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output") ->expected(1, 3) ->needs("--ngpus"); non_token ->add_option("-b, --batchSize", params.nbatch, "If this value is greater than one, arrays will be used") ->default_val(1); CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides"); CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides"); non_token->add_option("--idist", params.idist, "Logical distance between input batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; }); non_token->add_option("--odist", params.odist, "Logical distance between output batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; }); CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset"); CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset"); app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures", ignore_hip_runtime_failures, "Ignore hip runtime failures"); app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0); app.add_option("-N, --ntrial", ntrial, "Trial size for the problem") ->default_val(1) ->each([&](const std::string& val) { std::cout << "Running profile with " << val << " samples\n"; }); // Default value is set in fft_params.h based on if device-side PRNG was enabled. 
app.add_option("-g, --inputGen", params.igen, "Input data generation:\n0) PRNG sequence (device)\n" "1) PRNG sequence (host)\n" "2) linearly-spaced sequence (device)\n" "3) linearly-spaced sequence (host)"); app.add_option("--isize", params.isize, "Logical size of input buffer"); app.add_option("--osize", params.osize, "Logical size of output buffer"); app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output"); // Parse args and catch any errors here try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } if(!token.empty()) { std::cout << "Reading fft params from token:\n" << token << std::endl; try { params.from_token(token); } catch(...) { std::cout << "Unable to parse token." << std::endl; return EXIT_FAILURE; } std::cout << std::flush; } else // generate token { if(ngpus > 1) { // set default GPU grids in case none were given params.set_default_grid(ngpus, ingrid, outgrid); // split the problem among ngpus params.mp_lib = fft_params::fft_mp_lib_none; int localDeviceCount = 0; (void)hipGetDeviceCount(&localDeviceCount); // start with all-ones in grids std::vector input_grid(params.length.size() + 1, 1); std::vector output_grid(params.length.size() + 1, 1); // create input and output grids and distribute it according to user requirements std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1); std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1); params.distribute_input(localDeviceCount, input_grid); params.distribute_output(localDeviceCount, output_grid); } if(*opt_not_in_place) { std::cout << "out-of-place\n"; } else { std::cout << "in-place\n"; } if(*opt_length) { std::cout << "length:"; for(auto& i : params.length) std::cout << " " << i; std::cout << "\n"; } if(*opt_istride) { std::cout << "istride:"; for(auto& i : params.istride) std::cout << " " << i; std::cout << "\n"; } if(*opt_ostride) { std::cout << "ostride:"; for(auto& i : params.ostride) std::cout << " " << i; std::cout 
<< "\n"; } if(*opt_ioffset) { std::cout << "ioffset:"; for(auto& i : params.ioffset) std::cout << " " << i; std::cout << "\n"; } if(*opt_ooffset) { std::cout << "ooffset:"; for(auto& i : params.ooffset) std::cout << " " << i; std::cout << "\n"; } if(*opt_ingrid || !ingrid.empty()) { std::cout << "input grid:"; for(auto& i : ingrid) std::cout << " " << i; std::cout << "\n"; } if(*opt_outgrid || !outgrid.empty()) { std::cout << "output grid:"; for(auto& i : outgrid) std::cout << " " << i; std::cout << "\n"; } std::cout << "\n"; } std::cout << std::flush; rocfft_setup(); // Set GPU for single-device FFT computation rocfft_scoped_device dev(deviceId); params.validate(); if(!params.valid(verbose)) { throw std::runtime_error("Invalid parameters, add --verbose=1 for detail"); } std::cout << "Token: " << params.token() << std::endl; if(verbose) { std::cout << params.str(" ") << std::endl; } // Check free and total available memory: size_t free = 0; size_t total = 0; try { HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? 
EXIT_SUCCESS : EXIT_FAILURE; } const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } const auto vram_footprint = params.vram_footprint(); if(!vram_fits_problem(vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } auto ret = params.create_plan(); if(ret != fft_status_success) LIB_V_THROW(rocfft_status_failure, "Plan creation failed"); // GPU input buffer: auto ibuffer_sizes = params.ibuffer_sizes(); std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { try { HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } pibuffer[i] = ibuffer[i].data(); } // CPU-side input buffer std::vector ibuffer_cpu; auto is_host_gen = (params.igen == fft_input_generator_host || params.igen == fft_input_random_generator_host); #ifdef USE_HIPRAND if(!is_host_gen) { // Input data: params.compute_input(ibuffer); if(verbose > 1) { // Copy input to CPU try { ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { try { HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? 
EXIT_SUCCESS : EXIT_FAILURE; } } std::cout << "GPU input:\n"; params.print_ibuffer(ibuffer_cpu); } } #endif if(is_host_gen) { // Input data: ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); params.compute_input(ibuffer_cpu); if(verbose > 1) { std::cout << "GPU input:\n"; params.print_ibuffer(ibuffer_cpu); } for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) { try { HIP_V_THROW(hipMemcpy(pibuffer[idx], ibuffer_cpu[idx].data(), ibuffer_cpu[idx].size(), hipMemcpyHostToDevice), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } } // GPU output buffer: std::vector obuffer_data; std::vector* obuffer = &obuffer_data; if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), "Creating output Buffer failed"); } } std::vector pobuffer(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } // Scatter input out to other devices and adjust I/O buffers to match requested transform params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); // Execute a warm-up call params.execute(pibuffer.data(), pobuffer.data()); // Run the transform several times and record the execution time: std::vector gpu_time(ntrial); hipEvent_wrapper_t start, stop; start.alloc(); stop.alloc(); for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial) { // Create input at every iteration to avoid overflow if(params.ifields.empty()) { #ifdef USE_HIPRAND // Compute input on default device if(!is_host_gen) params.compute_input(ibuffer); #endif if(is_host_gen) { for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) { try { HIP_V_THROW(hipMemcpy(pibuffer[idx], ibuffer_cpu[idx].data(), ibuffer_cpu[idx].size(), 
hipMemcpyHostToDevice), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } } // Scatter input out to other devices if this is a multi-GPU test params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer); } HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed"); params.execute(pibuffer.data(), pobuffer.data()); HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed"); HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed"); float time; HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed"); gpu_time[itrial] = time; // Print result after FFT transform if(verbose > 2) { // Gather data to default GPU if this is a multi-GPU test params.multi_gpu_finalize(*obuffer, pobuffer); auto output = allocate_host_buffer(params.precision, params.otype, params.osize); for(unsigned int idx = 0; idx < output.size(); ++idx) { try { HIP_V_THROW(hipMemcpy(output[idx].data(), pobuffer.at(idx), output[idx].size(), hipMemcpyDeviceToHost), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } std::cout << "GPU output:\n"; params.print_obuffer(output); } } std::cout << "\nExecution gpu time:"; for(const auto& i : gpu_time) { std::cout << " " << i; } std::cout << " ms" << std::endl; std::cout << "Execution gflops: "; const double totsize = product(params.length.begin(), params.length.end()); const double k = ((params.itype == fft_array_type_real) || (params.otype == fft_array_type_real)) ? 2.5 : 5.0; const double opscount = (double)params.nbatch * k * totsize * log(totsize) / log(2.0); for(const auto& i : gpu_time) { std::cout << " " << opscount / (1e6 * i); } std::cout << std::endl; rocfft_cleanup(); } rocFFT-rocm-6.4.3/clients/bench/bench.h000066400000000000000000000061561501537341300175670ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCFFT_BENCH_H #define ROCFFT_BENCH_H #include "rocfft/rocfft.h" #include #include class rocfft_hip_runtime_error : public std::runtime_error { public: rocfft_hip_runtime_error(const std::string& msg = "") : runtime_error(msg) { } }; // This is used to either wrap a HIP function call, or to explicitly check a variable // for an error condition. If an error occurs, we throw. 
// Note: std::runtime_error does not take unicode strings as input, so only strings // supported inline void hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != hipSuccess) { std::stringstream tmp; tmp << "HIP_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw rocfft_hip_runtime_error(errorm); } } class rocfft_runtime_error : public std::runtime_error { public: rocfft_runtime_error(const std::string& msg = "") : runtime_error(msg) { } }; inline void lib_V_Throw(rocfft_status res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != rocfft_status_success) { std::stringstream tmp; tmp << "LIB_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw rocfft_runtime_error(errorm); } } #define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__) #define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__) #endif // ROCFFT_BENCH_H rocFFT-rocm-6.4.3/clients/bench/dyna-bench.cpp000066400000000000000000000774711501537341300210630ustar00rootroot00000000000000// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// This file allows one to run tests on multiple different rocFFT libraries at the same time.
// This allows one to randomize the execution order for a better experimental setup
// which produces fewer type 1 errors where one incorrectly rejects the null hypothesis.
#include #if __has_include() #include #else #include namespace std { namespace filesystem = experimental::filesystem; } #endif #include #include #include #include #ifdef WIN32 #include // psapi.h requires windows.h to be included first #include #else #include #include #endif #include "../../shared/CLI11.hpp" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "../../shared/rocfft_params.h" #include "bench.h" #include "rocfft/rocfft.h" #ifdef WIN32 typedef HMODULE ROCFFT_LIB; #else typedef void* ROCFFT_LIB; #endif // Load the rocfft library ROCFFT_LIB rocfft_lib_load(const std::string& path) { #ifdef WIN32 return LoadLibraryA(path.c_str()); #else return dlopen(path.c_str(), RTLD_LAZY); #endif } // Return a string describing the error loading rocfft const char* rocfft_lib_load_error() { #ifdef WIN32 // just return the error number static std::string error_str; error_str = std::to_string(GetLastError()); return error_str.c_str(); #else return dlerror(); #endif } // Get symbol from rocfft lib void* rocfft_lib_symbol(ROCFFT_LIB libhandle, const char* sym) { #ifdef WIN32 return reinterpret_cast(GetProcAddress(libhandle, sym)); #else return dlsym(libhandle, sym); #endif } void rocfft_lib_close(ROCFFT_LIB libhandle) { #ifdef WIN32 FreeLibrary(libhandle); #else dlclose(libhandle); #endif } // Given a libhandle from dload, return a plan to a rocFFT plan with the given parameters. 
// make_plan resolves every entry point it needs from libhandle up front, calls
// rocfft_setup through the loaded library, and then creates a plan whose data
// layout, placement, transform type, precision, lengths and batch count are taken
// from params.
//
// Throws rocfft_runtime_error when a symbol cannot be resolved or when a rocFFT
// call reports a status other than rocfft_status_success.
rocfft_plan make_plan(ROCFFT_LIB libhandle, const fft_params& params)
{
    // Resolve a required symbol. A missing export becomes an exception here rather
    // than a call through a null function pointer further down.
    const auto require_symbol = [libhandle](const char* name) -> void* {
        void* sym = rocfft_lib_symbol(libhandle, name);
        if(sym == NULL)
            throw rocfft_runtime_error(std::string("Failed to resolve symbol: ") + name);
        return sym;
    };

    auto procfft_setup = (decltype(&rocfft_setup))require_symbol("rocfft_setup");
    auto procfft_plan_description_create = (decltype(&rocfft_plan_description_create))
        require_symbol("rocfft_plan_description_create");
    auto procfft_plan_description_destroy = (decltype(&rocfft_plan_description_destroy))
        require_symbol("rocfft_plan_description_destroy");
    auto procfft_plan_description_set_data_layout
        = (decltype(&rocfft_plan_description_set_data_layout))require_symbol(
            "rocfft_plan_description_set_data_layout");
    auto procfft_plan_create
        = (decltype(&rocfft_plan_create))require_symbol("rocfft_plan_create");

    // The setup status used to be discarded; report a failed initialization
    // instead of carrying on with an unusable library.
    LIB_V_THROW(procfft_setup(), "rocfft_setup failed");

    rocfft_plan_description desc = NULL;
    LIB_V_THROW(procfft_plan_description_create(&desc), "rocfft_plan_description_create failed");

    rocfft_plan plan = NULL;
    try
    {
        LIB_V_THROW(
            procfft_plan_description_set_data_layout(desc,
                                                     rocfft_array_type_from_fftparams(params.itype),
                                                     rocfft_array_type_from_fftparams(params.otype),
                                                     params.ioffset.data(),
                                                     params.ooffset.data(),
                                                     params.istride.size(),
                                                     params.istride.data(),
                                                     params.idist,
                                                     params.ostride.size(),
                                                     params.ostride.data(),
                                                     params.odist),
            "rocfft_plan_description_set_data_layout failed");

        LIB_V_THROW(procfft_plan_create(&plan,
                                        rocfft_result_placement_from_fftparams(params.placement),
                                        rocfft_transform_type_from_fftparams(params.transform_type),
                                        rocfft_precision_from_fftparams(params.precision),
                                        params.length.size(),
                                        params.length.data(),
                                        params.nbatch,
                                        desc),
                    "rocfft_plan_create failed");
    }
    catch(...)
    {
        // Best-effort release of the description so a failed layout or plan
        // creation does not leak it; the original error is what gets reported.
        (void)procfft_plan_description_destroy(desc);
        throw;
    }

    LIB_V_THROW(procfft_plan_description_destroy(desc), "rocfft_plan_description_destroy failed");

    return plan;
}

// Given a libhandle from dload and a rocFFT plan, destroy the plan.
void destroy_plan(ROCFFT_LIB libhandle, rocfft_plan& plan) { auto procfft_plan_destroy = (decltype(&rocfft_plan_destroy))rocfft_lib_symbol(libhandle, "rocfft_plan_destroy"); LIB_V_THROW(procfft_plan_destroy(plan), "rocfft_plan_destroy failed"); auto procfft_cleanup = (decltype(&rocfft_cleanup))rocfft_lib_symbol(libhandle, "rocfft_cleanup"); if(procfft_cleanup) LIB_V_THROW(procfft_cleanup(), "rocfft_cleanup failed"); } // Given a libhandle from dload and a rocFFT execution info structure, destroy the info. void destroy_info(ROCFFT_LIB libhandle, rocfft_execution_info& info) { auto procfft_execution_info_destroy = (decltype(&rocfft_execution_info_destroy))rocfft_lib_symbol( libhandle, "rocfft_execution_info_destroy"); LIB_V_THROW(procfft_execution_info_destroy(info), "rocfft_execution_info_destroy failed"); } // Given a libhandle from dload, and a corresponding rocFFT plan, return how much work // buffer is required. size_t get_wbuffersize(ROCFFT_LIB libhandle, const rocfft_plan& plan) { auto procfft_plan_get_work_buffer_size = (decltype(&rocfft_plan_get_work_buffer_size))rocfft_lib_symbol( libhandle, "rocfft_plan_get_work_buffer_size"); // Get the buffersize size_t workBufferSize = 0; LIB_V_THROW(procfft_plan_get_work_buffer_size(plan, &workBufferSize), "rocfft_plan_get_work_buffer_size failed"); return workBufferSize; } // Given a libhandle from dload and a corresponding rocFFT plan, print the plan information. 
void show_plan(ROCFFT_LIB libhandle, const rocfft_plan& plan) { auto procfft_plan_get_print = (decltype(&rocfft_plan_get_print))rocfft_lib_symbol(libhandle, "rocfft_plan_get_print"); LIB_V_THROW(procfft_plan_get_print(plan), "rocfft_plan_get_print failed"); } // FIXME: doc rocfft_execution_info make_execinfo(ROCFFT_LIB libhandle) { auto procfft_execution_info_create = (decltype(&rocfft_execution_info_create))rocfft_lib_symbol( libhandle, "rocfft_execution_info_create"); rocfft_execution_info info = NULL; LIB_V_THROW(procfft_execution_info_create(&info), "rocfft_execution_info_create failed"); return info; } // FIXME: doc void set_work_buffer(const ROCFFT_LIB& libhandle, rocfft_execution_info& info, const size_t wbuffersize, void* wbuffer) { if(wbuffersize > 0 && wbuffer != NULL) { auto procfft_execution_info_set_work_buffer = (decltype(&rocfft_execution_info_set_work_buffer))rocfft_lib_symbol( libhandle, "rocfft_execution_info_set_work_buffer"); LIB_V_THROW(procfft_execution_info_set_work_buffer(info, wbuffer, wbuffersize), "rocfft_execution_info_set_work_buffer failed"); } } // Given a libhandle from dload and a corresponding rocFFT plan and execution info, // execute a transform on the given input and output buffers and return the kernel // execution time. 
float run_plan( ROCFFT_LIB libhandle, rocfft_plan plan, rocfft_execution_info info, void** in, void** out) { auto procfft_execute = (decltype(&rocfft_execute))rocfft_lib_symbol(libhandle, "rocfft_execute"); hipEvent_wrapper_t start, stop; start.alloc(); stop.alloc(); HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed"); auto rcfft = procfft_execute(plan, in, out, info); HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed"); HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed"); if(rcfft != rocfft_status_success) { throw std::runtime_error("execution failed"); } float time; HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed"); return time; } std::pair create_handleplan(const std::string& libstring, const fft_params& params) { auto libhandle = rocfft_lib_load(libstring); if(libhandle == NULL) { std::stringstream ss; ss << "Failed to open " << libstring << ", error: " << rocfft_lib_load_error(); throw std::runtime_error(ss.str()); } auto plan = make_plan(libhandle, params); return std::make_pair(libhandle, plan); } int main(int argc, char* argv[]) { // Control output verbosity: int verbose{}; // number of GPUs to use: int ngpus{}; // hip Device number for running tests: int deviceId{}; // Ignore runtime failures. // eg: hipMalloc failing when there isn't enough free vram. bool ignore_hip_runtime_failures{true}; // Number of performance trial samples: int ntrial{}; // Bool to specify whether the libs are loaded in forward or forward+reverse order. int reverse{}; // Test sequence choice: int test_sequence{}; // Vector of test target libraries std::vector lib_strings; // FFT parameters: fft_params params; // input/output FFT grids std::vector ingrid; std::vector outgrid; // Token string to fully specify fft params. std::string token; CLI::App app{"dyna-rocfft-bench command line options"}; // Declare the supported options. Some option pointers are declared to track passed opts. 
// FIXME: version needs to be implemented app.add_flag("--version", "Print queryable version information from the rocfft library and exit"); app.add_flag("--reverse", reverse, "Load libs in forward and reverse order")->default_val(1); app.add_option( "--sequence", test_sequence, "Test sequence:\n0) random\n1) alternating\n2) sequential") ->default_val(0); app.add_option("--lib", lib_strings, "Set test target library full path (appendable)"); CLI::Option* opt_token = app.add_option("--token", token, "Token to read FFT params from")->default_val(""); // Group together options that conflict with --token auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { params.precision = fft_precision_double; }); non_token->excludes(opt_token); non_token ->add_option("-t, --transformType", params.transform_type, "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); non_token ->add_option( "--precision", params.precision, "Transform precision: single (default), double, half") ->excludes("--double"); CLI::Option* opt_not_in_place = non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)") ->each([&](const std::string&) { params.placement = fft_placement_notinplace; }); non_token ->add_option("--itype", params.itype, "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token ->add_option("--otype", params.otype, "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); CLI::Option* opt_length = non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3); 
non_token->add_option("--ngpus", ngpus, "Number of GPUs to use") ->default_val(1) ->check(CLI::NonNegativeNumber); // define multi-GPU grids for FFT computation, CLI::Option* opt_ingrid = non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input") ->expected(1, 3) ->needs("--ngpus"); CLI::Option* opt_outgrid = non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output") ->expected(1, 3) ->needs("--ngpus"); non_token ->add_option("-b, --batchSize", params.nbatch, "If this value is greater than one, arrays will be used") ->default_val(1); CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides"); CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides"); non_token->add_option("--idist", params.idist, "Logical distance between input batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; }); non_token->add_option("--odist", params.odist, "Logical distance between output batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; }); CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset"); CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset"); app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures", ignore_hip_runtime_failures, "Ignore hip runtime failures"); app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0); app.add_option("-N, --ntrial", ntrial, "Trial size for the problem") ->default_val(1) ->each([&](const std::string& val) { std::cout << "Running profile with " << val << " samples\n"; }); // Default value is set in fft_params.h based on if device-side PRNG was enabled. 
app.add_option("-g, --inputGen", params.igen, "Input data generation:\n0) PRNG sequence (device)\n" "1) PRNG sequence (host)\n" "2) linearly-spaced sequence (device)\n" "3) linearly-spaced sequence (host)"); app.add_option("--isize", params.isize, "Logical size of input buffer"); app.add_option("--osize", params.osize, "Logical size of output buffer"); app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output"); // Parse args and catch any errors here try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } // Check if all the provided libraries are actually there: for(const auto& lib_string : lib_strings) { if(!std::filesystem::exists(lib_string)) { std::cerr << "Error: lib " << lib_string << " does not exist\n"; return EXIT_FAILURE; } } if(!token.empty()) { std::cout << "Reading fft params from token:\n" << token << std::endl; try { params.from_token(token); } catch(...) { std::cout << "Unable to parse token." << std::endl; return EXIT_FAILURE; } } else { if(ngpus > 1) { // set default GPU grids in case none were given params.set_default_grid(ngpus, ingrid, outgrid); // split the problem among ngpus params.mp_lib = fft_params::fft_mp_lib_none; int localDeviceCount = 0; (void)hipGetDeviceCount(&localDeviceCount); // start with all-ones in grids std::vector input_grid(params.length.size() + 1, 1); std::vector output_grid(params.length.size() + 1, 1); // create input and output grids and distribute it according to user requirements std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1); std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1); params.distribute_input(localDeviceCount, input_grid); params.distribute_output(localDeviceCount, output_grid); } if(*opt_not_in_place) { std::cout << "out-of-place\n"; } else { std::cout << "in-place\n"; } if(*opt_length) { std::cout << "length:"; for(auto& i : params.length) std::cout << " " << i; std::cout << "\n"; } if(*opt_istride) { std::cout << 
"istride:"; for(auto& i : params.istride) std::cout << " " << i; std::cout << "\n"; } if(*opt_ostride) { std::cout << "ostride:"; for(auto& i : params.ostride) std::cout << " " << i; std::cout << "\n"; } if(*opt_ioffset) { std::cout << "ioffset:"; for(auto& i : params.ioffset) std::cout << " " << i; std::cout << "\n"; } if(*opt_ooffset) { std::cout << "ooffset:"; for(auto& i : params.ooffset) std::cout << " " << i; std::cout << "\n"; } if(*opt_ingrid || !ingrid.empty()) { std::cout << "input grid:"; for(auto& i : ingrid) std::cout << " " << i; std::cout << "\n"; } if(*opt_outgrid || !outgrid.empty()) { std::cout << "output grid:"; for(auto& i : outgrid) std::cout << " " << i; std::cout << "\n"; } } std::cout << std::flush; // Set GPU for single-device FFT computation rocfft_scoped_device dev(deviceId); params.validate(); if(!params.valid(verbose)) { throw rocfft_runtime_error("Invalid parameters, add --verbose=1 for detail"); } std::cout << "Token: " << params.token() << std::endl; if(verbose) { std::cout << params.str() << std::endl; } // Check free and total available memory: size_t free = 0; size_t total = 0; try { HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } // GPU input buffer: auto ibuffer_sizes = params.ibuffer_sizes(); std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { try { HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? 
EXIT_SUCCESS : EXIT_FAILURE; } pibuffer[i] = ibuffer[i].data(); } // CPU-side input buffer std::vector ibuffer_cpu; auto is_host_gen = (params.igen == fft_input_generator_host || params.igen == fft_input_random_generator_host); #ifdef USE_HIPRAND if(!is_host_gen) { // Input data: params.compute_input(ibuffer); if(verbose > 1) { // Copy input to CPU ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { try { HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } std::cout << "GPU input:\n"; params.print_ibuffer(ibuffer_cpu); } } #endif if(is_host_gen) { // Input data: ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); params.compute_input(ibuffer_cpu); if(verbose > 1) { std::cout << "GPU input:\n"; params.print_ibuffer(ibuffer_cpu); } for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) { try { HIP_V_THROW(hipMemcpy(pibuffer[idx], ibuffer_cpu[idx].data(), ibuffer_cpu[idx].size(), hipMemcpyHostToDevice), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } } // GPU output buffer: std::vector obuffer_data; std::vector* obuffer = &obuffer_data; if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { try { HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), "Creating output Buffer failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? 
EXIT_SUCCESS : EXIT_FAILURE; } } } std::vector pobuffer(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } // Execution times for loaded libraries: std::vector> time(lib_strings.size()); // If we are doing a reverse-run, then we need two ntrials; otherwise, just one. std::vector ntrial_runs; if(reverse == 0) { ntrial_runs.push_back(ntrial); } else { ntrial_runs.push_back((ntrial + 1) / 2); ntrial_runs.push_back(ntrial / 2); } for(size_t ridx = 0; ridx < ntrial_runs.size(); ++ridx) { std::vector> index_lib_string; for(size_t i = 0; i < lib_strings.size(); ++i) { index_lib_string.push_back(std::make_pair(i, lib_strings[i])); } if(ridx == 1) { std::reverse(index_lib_string.begin(), index_lib_string.end()); } // Create the handles to the libs and the associated fft plans. std::vector handle; std::vector plan; // Allocate the work buffer: just one, big enough for any dloaded library. std::vector info; size_t wbuffer_size = 0; for(unsigned int idx = 0; idx < lib_strings.size(); ++idx) { std::cout << idx << ": " << lib_strings[idx] << "\n"; auto libhandle = rocfft_lib_load(lib_strings[idx]); if(libhandle == NULL) { std::cout << "Failed to open " << lib_strings[idx] << ", error: " << rocfft_lib_load_error() << "\n"; return 1; } handle.push_back(libhandle); plan.push_back(make_plan(handle[idx], params)); show_plan(handle[idx], plan[idx]); wbuffer_size = std::max(wbuffer_size, get_wbuffersize(handle[idx], plan[idx])); info.push_back(make_execinfo(handle[idx])); } std::cout << "Work buffer size: " << wbuffer_size << std::endl; if(!vram_fits_problem(raw_vram_footprint + wbuffer_size, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << " + " << +wbuffer_size << " = " << raw_vram_footprint + wbuffer_size << " ) data too large for device.\n"; return EXIT_SUCCESS; } gpubuf wbuffer; if(wbuffer_size) { try { HIP_V_THROW(wbuffer.alloc(wbuffer_size), "Creating intermediate Buffer failed"); } 
catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } // Associate the work buffer to the individual libraries: for(unsigned int idx = 0; idx < lib_strings.size(); ++idx) { set_work_buffer(handle[idx], info[idx], wbuffer_size, wbuffer.data()); } // Run the plan using its associated rocFFT library: for(unsigned int idx = 0; idx < handle.size(); ++idx) { try { run_plan(handle[idx], plan[idx], info[idx], pibuffer.data(), pobuffer.data()); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } std::vector testcase(ntrial_runs[ridx] * index_lib_string.size()); switch(test_sequence) { case 0: { // Random order: for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial) { for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib) { testcase[index_lib_string.size() * itrial + ilib] = ilib; } } std::random_device rd; std::mt19937 g(rd()); std::shuffle(testcase.begin(), testcase.end(), g); break; } case 1: // Alternating order: for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial) { for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib) { testcase[index_lib_string.size() * itrial + ilib] = ilib; } } break; case 2: // Sequential order: for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial) { for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib) { testcase[ilib * ntrial + itrial] = ilib; } } break; default: throw std::runtime_error("Invalid test sequence choice."); } if(verbose > 3) { std::cout << "Test case order:"; for(const auto val : testcase) std::cout << " " << val; std::cout << "\n"; } std::cout << "Running the tests...\n"; for(size_t itest = 0; itest < testcase.size(); ++itest) { const int tidx = testcase[itest]; if(verbose > 3) { std::cout << "running test case " << tidx << " with lib " << index_lib_string[tidx].second << "\n"; } #ifdef USE_HIPRAND if(!is_host_gen) params.compute_input(ibuffer); #endif if(is_host_gen) { for(unsigned int bidx = 0; 
bidx < ibuffer_cpu.size(); ++bidx) { try { HIP_V_THROW(hipMemcpy(pibuffer[bidx], ibuffer_cpu[bidx].data(), ibuffer_cpu[bidx].size(), hipMemcpyHostToDevice), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } } // Run the plan using its associated rocFFT library: try { time[tidx].push_back(run_plan( handle[tidx], plan[tidx], info[tidx], pibuffer.data(), pobuffer.data())); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } if(verbose > 2) { auto output = allocate_host_buffer(params.precision, params.otype, params.osize); for(unsigned int iout = 0; iout < output.size(); ++iout) { try { HIP_V_THROW(hipMemcpy(output[iout].data(), pobuffer[iout], output[iout].size(), hipMemcpyDeviceToHost), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } std::cout << "GPU output:\n"; params.print_obuffer(output); } } // Clean up: for(unsigned int hidx = 0; hidx < handle.size(); ++hidx) { destroy_info(handle[hidx], info[hidx]); destroy_plan(handle[hidx], plan[hidx]); rocfft_lib_close(handle[hidx]); } } std::cout << "Execution times in ms:\n"; for(unsigned int idx = 0; idx < time.size(); ++idx) { std::cout << "\nExecution gpu time:"; for(auto& i : time[idx]) { std::cout << " " << i; } std::cout << " ms" << std::endl; } return EXIT_SUCCESS; } rocFFT-rocm-6.4.3/clients/cmake/000077500000000000000000000000001501537341300163305ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/cmake/build-gtest.cmake000066400000000000000000000046041501537341300215610ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
include( ExternalProject ) option( BUILD_GTEST "Download and build GoogleTest" OFF ) if( NOT BUILD_GTEST ) find_package( GTest 1.11.0 ) endif() if( (BUILD_GTEST OR NOT GTEST_FOUND) AND (NOT TARGET gtest) ) set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/src/gtest/googletest/include) set(GTEST_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}) set(GTEST_SRC_URL https://github.com/google/googletest/archive/release-1.11.0.tar.gz CACHE STRING "Location of GTest source code") set(GTEST_SRC_SHA256 b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5 CACHE STRING "SHA256 hash of GTest source code") ExternalProject_Add(gtest URL ${GTEST_SRC_URL} URL_HASH SHA256=${GTEST_SRC_SHA256} PREFIX ${CMAKE_CURRENT_BINARY_DIR} CMAKE_ARGS -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} INSTALL_COMMAND "" BUILD_BYPRODUCTS ${GTEST_LIBRARIES}) ExternalProject_Get_Property( gtest source_dir binary_dir ) endif() rocFFT-rocm-6.4.3/clients/cmake/build-options.cmake000066400000000000000000000036001501537341300221210ustar00rootroot00000000000000# Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # This file is intended to be used in two ways; independently in a stand alone PROJECT # and as part of a superbuild. If the file is included in a stand alone project, the # variables are not expected to be preset, and this will produce options() in the GUI # for the user to examine. If this file is included in a superbuild, the options will be # presented in the superbuild GUI, but then passed into the ExternalProject as -D # parameters, which would already define them. if( NOT BUILD_CLIENTS_TESTS ) option( BUILD_CLIENTS_TESTS "Build rocFFT unit tests" OFF ) endif( ) if( NOT BUILD_CLIENTS_BENCH ) option( BUILD_CLIENTS_BENCH "Build rocFFT benchmarks" OFF ) endif( ) if( NOT BUILD_CLIENTS_SAMPLES ) option( BUILD_CLIENTS_SAMPLES "Build rocFFT samples" OFF ) endif( ) rocFFT-rocm-6.4.3/clients/samples/000077500000000000000000000000001501537341300167145ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/samples/CMakeLists.txt000066400000000000000000000050531501537341300214570ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) endif() set( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ON ) project( rocfft-clients-samples LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) list( APPEND samples_subdirs "fixed-16" ) list( APPEND samples_subdirs "fixed-large" ) list( APPEND samples_subdirs "rocfft" ) list( APPEND samples_subdirs "multi_gpu" ) if( ROCFFT_MPI_ENABLE ) list( APPEND samples_subdirs "mpi" ) endif() foreach( client ${samples_subdirs} ) add_subdirectory( ${client} ) endforeach( ) rocFFT-rocm-6.4.3/clients/samples/fixed-16/000077500000000000000000000000001501537341300202375ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/samples/fixed-16/CMakeLists.txt000066400000000000000000000072701501537341300230050ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( rocfft-clients-samples-fixed-16 LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( HIP REQUIRED ) endif() set( sample_list fixed-16-float fixed-16-double fixed-16-half ) foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) target_include_directories( ${sample} PRIVATE $ ) target_link_libraries( ${sample} PRIVATE roc::rocfft hip::device ) target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} ) set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( FIXED_16_OUT_DIR "/../../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( FIXED_16_OUT_DIR "/../../bin" ) elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ) set( FIXED_16_OUT_DIR "/../bin" ) else() set( FIXED_16_OUT_DIR "/bin" ) endif() string( CONCAT FIXED_16_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_16_OUT_DIR} ) set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${FIXED_16_OUT_DIR}) if( CUDA_FOUND ) 
target_include_directories( ${sample} PRIVATE $ $ ) target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ) endforeach( ) rocFFT-rocm-6.4.3/clients/samples/fixed-16/fixed-16-double.cpp000066400000000000000000000114341501537341300235410ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ #include "rocfft/rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. double2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work 
buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/samples/fixed-16/fixed-16-float.cpp000066400000000000000000000114301501537341300233700ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "rocfft/rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. float2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw 
std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/samples/fixed-16/fixed-16-half.cpp000066400000000000000000000116601501537341300232020ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "rocfft/rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector<_Float16_2> cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = static_cast<_Float16>(i + (i % 3) - (i % 7)); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(_Float16_2); // Create HIP device object. 
_Float16_2* x = nullptr; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_half, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector<_Float16_2> y(N); if(hipMemcpy(&y[0], x, Nbytes, 
hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << static_cast(cx[i].x) << "," << static_cast(cx[i].y) << ")" << " output: (" << static_cast(y[i].x) << "," << static_cast(y[i].y) << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/samples/fixed-large/000077500000000000000000000000001501537341300211035ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/samples/fixed-large/CMakeLists.txt000066400000000000000000000072741501537341300236550ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( rocfft-clients-samples-fixed-large LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( HIP REQUIRED ) endif() set( sample_list fixed-large-float fixed-large-double ) foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) target_include_directories( ${sample} PRIVATE $ ) target_link_libraries( ${sample} PRIVATE roc::rocfft ) target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} ) set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( FIXED_LARGE_OUT_DIR "/../../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( FIXED_LARGE_OUT_DIR "/../../bin" ) elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ) set( FIXED_LARGE_OUT_DIR "/../bin" ) else() set( FIXED_LARGE_OUT_DIR "/bin" ) endif() string( CONCAT FIXED_LARGE_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_LARGE_OUT_DIR} ) set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${FIXED_LARGE_OUT_DIR}) if( CUDA_FOUND ) 
target_include_directories( ${sample} PRIVATE $ $ ) target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ) endforeach( ) rocFFT-rocm-6.4.3/clients/samples/fixed-large/fixed-large-double.cpp000066400000000000000000000116541501537341300252550ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ #include #include #include #include "rocfft/rocfft.h" #include #include int main() { // For size N >= 8192, temporary buffer is required to allocated const size_t N = 64 * 2048; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. double2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != 
rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; if(workBuffer) if(hipFree(workBuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/samples/fixed-large/fixed-large-float.cpp000066400000000000000000000116501501537341300251040ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include #include #include #include "rocfft/rocfft.h" #include #include int main() { // For size N >= 8192, temporary buffer is required to allocated const size_t N = 64 * 2048; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. 
float2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; if(workBuffer) if(hipFree(workBuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; // Copy result back to 
host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/samples/mpi/000077500000000000000000000000001501537341300175015ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/samples/mpi/CMakeLists.txt000066400000000000000000000105621501537341300222450ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( rocfft-clients-samples-rocfft LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( HIP REQUIRED ) endif() if( NOT MPI_FOUND ) find_package( MPI REQUIRED ) endif() if( USE_HIPRAND AND NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() set( sample_list rocfft_mpi_example ) foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) target_include_directories( ${sample} PRIVATE $ ${MPI_CXX_INCLUDE_PATH} ) target_link_libraries( ${sample} PRIVATE roc::rocfft MPI::MPI_CXX ) message( "MPI_CXX_LIB_NAMES: ${MPI_CXX_LIB_NAMES}") if ( ROCFFT_CRAY_MPI_ENABLE ) target_link_libraries( ${sample} PRIVATE "mpi_gtl_hsa" ) get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY ) target_link_directories( ${sample} PRIVATE ${MPI_LIBDIR}/../../../../gtl/lib ) endif() if ( USE_HIPRAND ) target_link_libraries( ${sample} PRIVATE hip::hiprand ) endif() target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) set_target_properties( ${sample} 
PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" ) elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../bin" ) else() set( SAMPLES_ROCFFT_OUT_DIR "/bin" ) endif() string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} ) set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_ROCFFT_OUT_DIR}) if( CUDA_FOUND ) target_include_directories( ${sample} PRIVATE $ $ ) target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) endforeach( ) rocFFT-rocm-6.4.3/clients/samples/mpi/rocfft_mpi_example.cpp000066400000000000000000000377321501537341300240640ustar00rootroot00000000000000 /****************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include #include #include #include #include #include #include #include #include "rocfft.h" // Check all ranks for an rocFFT non-success status. auto rocfft_status_sync(const rocfft_status fftrc, const MPI_Comm comm) { // Since hipSuccess is the lowest enum value, we can find if there are any errors // by getting the maximum value of the return code over all procs. // Guarantee that the enum is an unsigned int so that we can send this via MPI: static_assert(std::is_same_v, unsigned int>); auto global_fftrc = rocfft_status_success; const auto mpirc = MPI_Allreduce(&fftrc, &global_fftrc, 1, MPI_UNSIGNED, MPI_MAX, comm); if(mpirc != MPI_SUCCESS) { return rocfft_status_failure; } return global_fftrc; } // Check all ranks for an hip runtime non-success status. auto hip_status_sync(const hipError_t hiprc, const MPI_Comm comm) { // Since rocfft_status_success is the lowest enum value, we can find if there are any errors // by getting the maximum value of the return code over all procs. 
// Guarantee that the enum is an unsigned int so that we can send this via MPI: static_assert(std::is_same_v, unsigned int>); auto global_hiprc = hipSuccess; const auto mpirc = MPI_Allreduce(&hiprc, &global_hiprc, 1, MPI_UNSIGNED, MPI_MAX, comm); if(mpirc != MPI_SUCCESS) { return hipErrorUnknown; } return global_hiprc; } int main(int argc, char** argv) { MPI_Init(&argc, &argv); MPI_Comm mpi_comm = MPI_COMM_WORLD; int mpi_size = 0; MPI_Comm_size(mpi_comm, &mpi_size); int mpi_rank = 0; MPI_Comm_rank(mpi_comm, &mpi_rank); if(mpi_rank == 0) { std::cout << "rocFFT MPI example\n"; std::cout << "MPI size: " << mpi_size << "\n"; } // General FFT parameters: std::vector length = {8, 8}; const rocfft_transform_type direction = rocfft_transform_type_complex_forward; const rocfft_result_placement place = rocfft_placement_notinplace; auto fftrc = rocfft_status_success; auto hiprc = hipSuccess; fftrc = rocfft_setup(); if(fftrc != rocfft_status_success) throw std::runtime_error("failed to set up rocFFT"); rocfft_plan_description description = nullptr; rocfft_plan_description_create(&description); fftrc = rocfft_plan_description_set_comm(description, rocfft_comm_mpi, &mpi_comm); if(fftrc != rocfft_status_success) throw std::runtime_error("failed add communicator to description"); // Do not set stride information via the descriptor, they are to be defined during field // creation below fftrc = rocfft_plan_description_set_data_layout(description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, 0, nullptr, 0, 0, nullptr, 0); if(fftrc != rocfft_status_success) throw std::runtime_error("failed to create description"); if(mpi_rank == 0) { std::cout << "input data decomposition:\n"; } std::vector gpu_in = {nullptr}; { rocfft_field infield = nullptr; rocfft_field_create(&infield); std::vector inbrick_stride = {1, length[1]}; const size_t inbrick_length1 = length[1] / (size_t)mpi_size + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 
1 : 0); const size_t inbrick_lower1 = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size); const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1; std::vector inbrick_lower = {0, inbrick_lower1}; std::vector inbrick_upper = {length[0], inbrick_upper1}; rocfft_brick inbrick = nullptr; rocfft_brick_create(&inbrick, inbrick_lower.data(), inbrick_upper.data(), inbrick_stride.data(), inbrick_lower.size(), 0); rocfft_field_add_brick(infield, inbrick); rocfft_brick_destroy(inbrick); inbrick = nullptr; const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex); std::vector> host_in(length[0] * inbrick_length1); for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0) { for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1) { const auto pos = (idx0 - inbrick_lower[0]) * inbrick_stride[0] + (idx1 - inbrick_lower[1]) * inbrick_stride[1]; host_in[pos] = std::complex(idx0, idx1); } } // Serialize output: for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << "in-brick rank " << irank; std::cout << "\n\tlower indices:"; for(const auto val : inbrick_lower) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : inbrick_upper) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : inbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0) { for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1) { const auto pos = (idx0 - inbrick_lower[0]) * inbrick_stride[0] + (idx1 - inbrick_lower[1]) * inbrick_stride[1]; std::cout << host_in[pos] << " "; } std::cout << "\n"; } } MPI_Barrier(mpi_comm); } hiprc = hipMalloc(&gpu_in[0], memSize); if(hiprc != hipSuccess) throw std::runtime_error("inbrick hipMalloc failed"); hiprc = hipMemcpy(gpu_in[0], host_in.data(), memSize, hipMemcpyHostToDevice); if(hiprc != 
hipSuccess) throw std::runtime_error("inbrick hipMemcpy failed"); rocfft_plan_description_add_infield(description, infield); fftrc = rocfft_field_destroy(infield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy infield"); } if(mpi_rank == 0) { std::cout << "output data decomposition:\n"; } std::vector gpu_out = {nullptr}; std::vector outbrick_lower; std::vector outbrick_upper; std::vector outbrick_stride = {1, length[1]}; { const size_t outbrick_length1 = length[1] / (size_t)mpi_size + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0); const size_t outbrick_lower1 = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size); const size_t outbrick_upper1 = outbrick_lower1 + outbrick_length1; outbrick_lower = {0, outbrick_lower1}; outbrick_upper = {length[0], outbrick_upper1}; const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex); for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << "out-brick rank " << irank; std::cout << "\n\tlower indices:"; for(const auto val : outbrick_lower) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : outbrick_upper) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : outbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; } MPI_Barrier(mpi_comm); } rocfft_field outfield = nullptr; rocfft_field_create(&outfield); rocfft_brick outbrick = nullptr; outbrick_lower = {0, outbrick_lower1}; outbrick_upper = {length[0], outbrick_lower1 + outbrick_length1}; rocfft_brick_create(&outbrick, outbrick_lower.data(), outbrick_upper.data(), outbrick_stride.data(), outbrick_lower.size(), 0); rocfft_field_add_brick(outfield, outbrick); rocfft_brick_destroy(outbrick); outbrick = nullptr; hiprc = hipMalloc(&gpu_out[0], memSize); if(hiprc != hipSuccess) throw std::runtime_error("outbrick hipMalloc failed"); 
rocfft_plan_description_add_outfield(description, outfield); fftrc = rocfft_field_destroy(outfield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy outfield"); } // In order still handle non-success return codes without killing all of the MPI processes, we // put object creation in a try/catch block and destroy non-nullptr objects. // Serialize output: for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << "rank " << irank << "\n"; std::cout << "input "; for(const auto& b : gpu_in) std::cout << " " << b; std::cout << "\n"; std::cout << "output "; for(const auto& b : gpu_out) std::cout << " " << b; std::cout << "\n"; } MPI_Barrier(mpi_comm); } fftrc = rocfft_status_sync(fftrc, mpi_comm); hiprc = hip_status_sync(hiprc, mpi_comm); if(mpi_rank == 0) { if(fftrc == rocfft_status_success && hiprc == hipSuccess) { std::cout << "so far so good, trying to make a plan....\n"; } else { std::cout << "failure: will not make a plan....\n"; } } // Create a multi-process plan: rocfft_plan gpu_plan = nullptr; if(fftrc == rocfft_status_success && hiprc == hipSuccess) { fftrc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms description); // Description } fftrc = rocfft_status_sync(fftrc, mpi_comm); if(mpi_rank == 0) { if(fftrc == rocfft_status_success) { std::cout << "so far so good, we have a plan....\n"; } else { std::cout << "failure: we do not have a plan....\n"; } } // Execute plan: if(fftrc == rocfft_status_success) { fftrc = rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), nullptr); } fftrc = rocfft_status_sync(fftrc, mpi_comm); if(mpi_rank == 0) { if(fftrc == rocfft_status_success) { std::cout << "The FFT was succesful....\n"; } else { std::cout << "The FFT execution failed....\n"; } } // Output the data: for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << 
"out brick rank " << irank << "\n"; const size_t outcount = (outbrick_upper[0] - outbrick_lower[0]) * (outbrick_upper[1] - outbrick_lower[1]); std::vector> host_out(outcount); hiprc = hipMemcpy(host_out.data(), gpu_out[0], outcount * sizeof(std::complex), hipMemcpyDeviceToHost); if(hiprc != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(auto idx0 = outbrick_lower[0]; idx0 < outbrick_upper[0]; ++idx0) { for(auto idx1 = outbrick_lower[1]; idx1 < outbrick_upper[1]; ++idx1) { const auto pos = (idx0 - outbrick_lower[0]) * outbrick_stride[0] + (idx1 - outbrick_lower[1]) * outbrick_stride[1]; std::cout << host_out[pos] << " "; } std::cout << "\n"; } } MPI_Barrier(mpi_comm); } // Cleanup anything plan-generation structs (that aren't null pointers): if(description != nullptr) { if(rocfft_plan_description_destroy(description) != rocfft_status_success) { std::cerr << "description descruction failed\n"; } else { description = nullptr; } } // Clean up the plan and rocfft: try { if(gpu_plan != nullptr) { if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; } } catch(const std::exception&) { std::cerr << "rank " << mpi_rank << " plan destroy failed\n"; } for(auto& buf : gpu_in) { if(buf != nullptr) { hiprc = hipFree(buf); if(hiprc != hipSuccess) std::cerr << "hipFree failed\n"; buf = nullptr; } } for(auto& buf : gpu_out) { if(buf != nullptr) { hiprc = hipFree(buf); if(hiprc != hipSuccess) std::cerr << "hipFree failed\n"; buf = nullptr; } } fftrc = rocfft_cleanup(); MPI_Finalize(); return 0; } rocFFT-rocm-6.4.3/clients/samples/multi_gpu/000077500000000000000000000000001501537341300207215ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/samples/multi_gpu/CMakeLists.txt000066400000000000000000000076641501537341300234760ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2023 Advanced Micro Devices, Inc. 
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required(VERSION 3.16)

# Seed the install prefix ahead of project(): the cache entry is written
# without FORCE, so this only provides a default.
if(NOT WIN32)
  set(CMAKE_INSTALL_PREFIX "/opt/rocm"
      CACHE PATH "Install path prefix, prepended onto install directories")
else()
  set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package"
      CACHE PATH "Install path prefix, prepended onto install directories")
endif()

# Also ahead of project(): default to a Release build unless a build type or a
# configuration list has already been defined (e.g. with -D). MSVC_IDE does
# not use CMAKE_BUILD_TYPE.
if(NOT (DEFINED CMAKE_CONFIGURATION_TYPES OR DEFINED CMAKE_BUILD_TYPE))
  set(CMAKE_BUILD_TYPE Release
      CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.")
endif()

project(rocfft-clients-samples-multi_gpu LANGUAGES CXX)

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

# Locate dependencies that an enclosing build has not already provided.
if(NOT TARGET rocfft)
  find_package(rocfft REQUIRED CONFIG PATHS)
endif()

if(NOT HIP_FOUND)
  find_package(HIP REQUIRED)
endif()

if(USE_HIPRAND AND NOT hiprand_FOUND)
  find_package(hiprand REQUIRED)
endif()

# Directory that receives the sample binaries. It depends only on which
# *_BUILD_SCOPE variable is set, so it is computed once for all samples.
if(ROCFFT_BUILD_SCOPE)
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/../../staging")
elseif(ROCFFT_CLIENTS_BUILD_SCOPE)
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/../../bin")
elseif(ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE)
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/../bin")
else()
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/bin")
endif()

set(sample_list mgpu_complex)

# One executable per entry, built from <name>.cpp.
foreach(sample IN LISTS sample_list)
  add_executable(${sample} ${sample}.cpp)

  # NOTE(review): the include-directory arguments below read as a bare `$` in
  # this copy of the file (presumably a generator expression whose <...> part
  # is missing); they are kept exactly as found.
  target_include_directories(${sample} PRIVATE $)

  target_link_libraries(${sample} PRIVATE roc::rocfft)
  if(USE_HIPRAND)
    target_link_libraries(${sample} PRIVATE hip::hiprand)
  endif()

  target_compile_options(${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp)

  set_target_properties(${sample} PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_ROCFFT_OUT_DIR})

  if(CUDA_FOUND)
    target_include_directories(${sample} PRIVATE $ $)
    target_compile_definitions(${sample} PRIVATE __HIP_PLATFORM_NVCC__)
  endif()

  target_link_libraries(${sample}
    PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS})
endforeach()
rocFFT-rocm-6.4.3/clients/samples/multi_gpu/mgpu_complex.cpp000066400000000000000000000323451501537341300241330ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include "../../../shared/CLI11.hpp" #include "rocfft/rocfft.h" #include #include #include int main(int argc, char* argv[]) { std::cout << "rocfft single-node multi-gpu complex-to-complex 3D FFT example\n"; // Length of transform, first dimension must be greather than number of GPU devices std::vector length = {8, 8}; // Gpu device ids: std::vector devices = {0, 1}; // Command-line options: CLI::App app{"rocfft sample command line options"}; app.add_option("--length", length, "2-D FFT size (eg: --length 256 256)"); app.add_option( "--devices", devices, "List of devices to use separated by spaces (eg: --devices 1 3)"); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } int deviceCount = devices.size(); std::cout << "Using " << deviceCount << " device(s)\n"; int nDevices; (void)hipGetDeviceCount(&nDevices); std::cout << "Number of available GPUs: " << nDevices << " \n"; if(nDevices <= static_cast(*std::max_element(devices.begin(), devices.end()))) throw std::runtime_error("device ID greater than number of available devices"); // Placeness for the transform auto fftrc = rocfft_status_success; fftrc = rocfft_setup(); if(fftrc != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = rocfft_placement_notinplace; // Direction of transform const rocfft_transform_type direction = rocfft_transform_type_complex_forward; rocfft_plan_description description = nullptr; rocfft_plan_description_create(&description); // Do not set stride information via the descriptor, they are to be defined during field // creation below rocfft_plan_description_set_data_layout(description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, 0, nullptr, 0, 0, nullptr, 0); auto hiprc = hipSuccess; std::cout << "input data decomposition:\n"; std::vector gpu_in(devices.size()); { // Row-major stride for brick data layout 
in memory std::vector inbrick_stride = {1, length[1]}; rocfft_field infield = nullptr; rocfft_field_create(&infield); std::vector> inbrick_lower(gpu_in.size()); std::vector> inbrick_upper(gpu_in.size()); for(size_t idx = 0; idx < gpu_in.size(); ++idx) { const size_t inbrick_length1 = length[1] / gpu_in.size() + (idx < length[1] % gpu_in.size() ? 1 : 0); const size_t inbrick_lower1 = idx * (length[1] / gpu_in.size()) + std::min(idx, length[1] % gpu_in.size()); const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1; inbrick_lower[idx] = {0, inbrick_lower1}; inbrick_upper[idx] = {length[0], inbrick_upper1}; rocfft_brick inbrick = nullptr; rocfft_brick_create(&inbrick, inbrick_lower[idx].data(), inbrick_upper[idx].data(), inbrick_stride.data(), inbrick_lower[idx].size(), devices[idx]); rocfft_field_add_brick(infield, inbrick); rocfft_brick_destroy(inbrick); inbrick = nullptr; const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex); std::cout << "in-brick " << idx; std::cout << "\n\tlower indices:"; for(const auto val : inbrick_lower[idx]) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : inbrick_upper[idx]) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : inbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; hiprc = hipSetDevice(devices[idx]); if(hiprc != hipSuccess) throw std::runtime_error("hipSetDevice failed"); hiprc = hipMalloc(&gpu_in[idx], memSize); if(hiprc != hipSuccess) throw std::runtime_error("hipMalloc failed"); std::vector> host_in(length[0] * inbrick_length1); for(auto idx0 = inbrick_lower[idx][0]; idx0 < inbrick_upper[idx][0]; ++idx0) { for(auto idx1 = inbrick_lower[idx][1]; idx1 < inbrick_upper[idx][1]; ++idx1) { const auto pos = (idx0 - inbrick_lower[idx][0]) * inbrick_stride[0] + (idx1 - inbrick_lower[idx][1]) * inbrick_stride[1]; host_in[pos] = std::complex(idx0, idx1); std::cout << host_in[pos] << " "; } 
std::cout << "\n"; } hiprc = hipMemcpy(gpu_in[idx], host_in.data(), memSize, hipMemcpyHostToDevice); if(hiprc != hipSuccess) throw std::runtime_error("hipMemcpy failed"); } rocfft_plan_description_add_infield(description, infield); fftrc = rocfft_field_destroy(infield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy infield"); } std::cout << "output data decomposition:\n"; std::vector gpu_out(devices.size()); std::vector> outbrick_lower(gpu_out.size()); std::vector> outbrick_upper(gpu_out.size()); std::vector outbrick_stride = {1, length[1]}; { rocfft_field outfield = nullptr; rocfft_field_create(&outfield); for(size_t idx = 0; idx < gpu_out.size(); ++idx) { const size_t outbrick_length1 = length[1] / gpu_out.size() + (idx < length[1] % gpu_in.size() ? 1 : 0); const size_t outbrick_lower1 = idx * (length[1] / gpu_out.size()) + std::min(idx, length[1] % gpu_out.size()); rocfft_brick outbrick = nullptr; outbrick_lower[idx] = {0, outbrick_lower1}; outbrick_upper[idx] = {length[0], outbrick_lower1 + outbrick_length1}; rocfft_brick_create(&outbrick, outbrick_lower[idx].data(), outbrick_upper[idx].data(), outbrick_stride.data(), outbrick_lower[idx].size(), devices[idx]); rocfft_field_add_brick(outfield, outbrick); rocfft_brick_destroy(outbrick); outbrick = nullptr; const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex); std::cout << "out-brick " << idx; std::cout << "\n\tlower indices:"; for(const auto val : outbrick_lower[idx]) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : outbrick_upper[idx]) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : outbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; (void)hipSetDevice(devices[idx]); if(hipMalloc(&gpu_out[idx], memSize) != hipSuccess) throw std::runtime_error("hipMalloc failed"); } rocfft_plan_description_add_outfield(description, outfield); fftrc = 
rocfft_field_destroy(outfield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy outfield"); } // Create a multi-gpu plan: (void)hipSetDevice(devices[0]); rocfft_plan gpu_plan = nullptr; fftrc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms description); // Description if(fftrc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get execution information and allocate work buffer rocfft_execution_info planinfo = nullptr; size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(gpu_plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&planinfo) != rocfft_status_success) throw std::runtime_error("failed to create execution info"); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed"); if(rocfft_execution_info_set_work_buffer(planinfo, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan: fftrc = rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), planinfo); if(fftrc != rocfft_status_success) throw std::runtime_error("failed to execute."); // Output the data. 
for(size_t idx = 0; idx < gpu_out.size(); ++idx) { std::cout << "out brick " << idx << "\n"; const auto nbrick = (outbrick_upper[idx][0] - outbrick_lower[idx][0]) * (outbrick_upper[idx][1] - outbrick_lower[idx][1]); std::vector> host_out(nbrick); hiprc = hipMemcpy(host_out.data(), gpu_out[idx], nbrick * sizeof(std::complex), hipMemcpyDeviceToHost); if(hiprc != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(auto idx0 = outbrick_lower[idx][0]; idx0 < outbrick_upper[idx][0]; ++idx0) { for(auto idx1 = outbrick_lower[idx][1]; idx1 < outbrick_upper[idx][1]; ++idx1) { const auto pos = (idx0 - outbrick_lower[idx][0]) * outbrick_stride[0] + (idx1 - outbrick_lower[idx][1]) * outbrick_stride[1]; std::cout << host_out[pos] << " "; } std::cout << "\n"; } } // Destroy plan if(planinfo != nullptr) { if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; } if(rocfft_plan_description_destroy(description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); for(size_t idx = 0; idx < gpu_in.size(); ++idx) { (void)hipFree(gpu_in[idx]); } for(size_t idx = 0; idx < gpu_out.size(); ++idx) { (void)hipFree(gpu_out[idx]); } return 0; } rocFFT-rocm-6.4.3/clients/samples/rocfft/000077500000000000000000000000001501537341300201775ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/samples/rocfft/CMakeLists.txt000066400000000000000000000100361501537341300227370ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required(VERSION 3.16)

# Seed the install prefix ahead of project(): the cache entry is written
# without FORCE, so this only provides a default.
if(NOT WIN32)
  set(CMAKE_INSTALL_PREFIX "/opt/rocm"
      CACHE PATH "Install path prefix, prepended onto install directories")
else()
  set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package"
      CACHE PATH "Install path prefix, prepended onto install directories")
endif()

# Also ahead of project(): default to a Release build unless a build type or a
# configuration list has already been defined (e.g. with -D). MSVC_IDE does
# not use CMAKE_BUILD_TYPE.
if(NOT (DEFINED CMAKE_CONFIGURATION_TYPES OR DEFINED CMAKE_BUILD_TYPE))
  set(CMAKE_BUILD_TYPE Release
      CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.")
endif()

project(rocfft-clients-samples-rocfft LANGUAGES CXX)

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

# Locate dependencies that an enclosing build has not already provided.
if(NOT TARGET rocfft)
  find_package(rocfft REQUIRED CONFIG PATHS)
endif()

if(NOT HIP_FOUND)
  find_package(HIP REQUIRED)
endif()

if(USE_HIPRAND AND NOT hiprand_FOUND)
  find_package(hiprand REQUIRED)
endif()

# Directory that receives the sample binaries. It depends only on which
# *_BUILD_SCOPE variable is set, so it is computed once for all samples.
if(ROCFFT_BUILD_SCOPE)
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/../../staging")
elseif(ROCFFT_CLIENTS_BUILD_SCOPE)
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/../../bin")
elseif(ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE)
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/../bin")
else()
  set(SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}/bin")
endif()

set(sample_list
  rocfft_example_complexcomplex
  rocfft_example_realcomplex
  rocfft_example_set_stream
  rocfft_example_callback)

# One executable per entry, built from <name>.cpp.
foreach(sample IN LISTS sample_list)
  add_executable(${sample} ${sample}.cpp)

  # NOTE(review): the include-directory arguments below read as a bare `$` in
  # this copy of the file (presumably a generator expression whose <...> part
  # is missing); they are kept exactly as found.
  target_include_directories(${sample} PRIVATE $)

  target_link_libraries(${sample} PRIVATE roc::rocfft)
  if(USE_HIPRAND)
    target_link_libraries(${sample} PRIVATE hip::hiprand)
  endif()

  target_compile_options(${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp)

  set_target_properties(${sample} PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_ROCFFT_OUT_DIR})

  if(CUDA_FOUND)
    target_include_directories(${sample} PRIVATE $ $)
    target_compile_definitions(${sample} PRIVATE __HIP_PLATFORM_NVCC__)
  endif()

  target_link_libraries(${sample}
    PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS})
endforeach()
rocFFT-rocm-6.4.3/clients/samples/rocfft/examplekernels.h000066400000000000000000000361771501537341300234050ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef EXAMPLEKERNELS_H #define EXAMPLEKERNELS_H #include "../../../shared/data_gen_device.h" #include #include #include // Kernel for initializing 1D real input data on the GPU. __global__ void initrdata1(double* x, const size_t Nx, const size_t xstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < Nx) { const auto pos = idx * xstride; x[pos] = idx + 1; } } // Kernel for initializing 2D real input data on the GPU. __global__ void initrdata2( double* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; if(idx < Nx && idy < Ny) { const auto pos = idx * xstride + idy * ystride; x[pos] = idx + idy; } } // Kernel for initializing 3D real input data on the GPU. 
// Element (i, j, k) is stored at x[i * xstride + j * ystride + k * zstride]
// and set to cos(cos(i + 2)) * sin(j * j + 1) / (k + 1).
__global__ void initrdata3(double*      x,
                           const size_t Nx,
                           const size_t Ny,
                           const size_t Nz,
                           const size_t xstride,
                           const size_t ystride,
                           const size_t zstride)
{
    // Widen before multiplying: blockIdx * blockDim is otherwise evaluated in
    // 32-bit unsigned arithmetic and can wrap for very large grids.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t idy = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    const size_t idz = static_cast<size_t>(blockIdx.z) * blockDim.z + threadIdx.z;
    if(idx < Nx && idy < Ny && idz < Nz)
    {
        const auto pos = idx * xstride + idy * ystride + idz * zstride;
        x[pos]         = cos(cos(idx + 2)) * sin(idy * idy + 1) / (idz + 1);
    }
}

// Kernel for initializing 1D complex data on the GPU.
// Element i is stored at x[i * xstride] and set to (1 + i, 1 + i).
__global__ void initcdata1(hipDoubleComplex* x, const size_t Nx, const size_t xstride)
{
    // 64-bit global index (see initrdata3).
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if(idx < Nx)
    {
        const auto pos = idx * xstride;
        x[pos].x       = 1 + idx;
        x[pos].y       = 1 + idx;
    }
}

// Kernel for initializing 2D complex input data on the GPU.
// Element (i, j) is stored at x[i * xstride + j * ystride] and set to
// (i + 1, j + 1).
__global__ void initcdata2(hipDoubleComplex* x,
                           const size_t      Nx,
                           const size_t      Ny,
                           const size_t      xstride,
                           const size_t      ystride)
{
    // size_t indices, matching the other kernels in this header.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t idy = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
    if(idx < Nx && idy < Ny)
    {
        const auto pos = idx * xstride + idy * ystride;
        x[pos].x       = idx + 1;
        x[pos].y       = idy + 1;
    }
}

// Kernel for initializing 3D complex input data on the GPU.
__global__ void initcdata3(hipDoubleComplex* x, const size_t Nx, const size_t Ny, const size_t Nz, const size_t xstride, const size_t ystride, const size_t zstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; const size_t idz = blockIdx.z * blockDim.z + threadIdx.z; if(idx < Nx && idy < Ny && idz < Nz) { const auto pos = idx * xstride + idy * ystride + idz * zstride; x[pos].x = idx + 10.0 * idz + 1; x[pos].y = idy + 10; } } // Helper function for determining grid dimensions template Tint1 ceildiv(const Tint1 nominator, const Tint2 denominator) { return (nominator + denominator - 1) / denominator; } // The following functions call the above kernels to initalize the input data for the transform. void initcomplex_cm(const std::vector& length_cm, const std::vector& stride_cm, void* gpu_in) { size_t blockSize = DATA_GEN_THREADS; const dim3 blockdim(blockSize); switch(length_cm.size()) { case 1: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x)); hipLaunchKernelGGL(initcdata1, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], stride_cm[0]); break; } case 2: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y)); hipLaunchKernelGGL(initcdata2, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], stride_cm[0], stride_cm[1]); break; } case 3: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y), ceildiv(length_cm[2], blockdim.z)); hipLaunchKernelGGL(initcdata3, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], length_cm[2], stride_cm[0], stride_cm[1], stride_cm[2]); break; } default: std::cout << "invalid dimension!\n"; exit(1); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("init_complex_data kernel launch failure: " + std::string(hipGetErrorName(err))); } // Initialize the real input buffer where the data has lengths 
given in length and stride given in // stride. The device buffer is assumed to have been allocated. void initreal_cm(const std::vector& length_cm, const std::vector& stride_cm, void* gpu_in) { size_t blockSize = DATA_GEN_THREADS; const dim3 blockdim(blockSize); switch(length_cm.size()) { case 1: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x)); hipLaunchKernelGGL( initrdata1, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], stride_cm[0]); break; } case 2: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y)); hipLaunchKernelGGL(initrdata2, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], length_cm[1], stride_cm[0], stride_cm[1]); break; } case 3: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y), ceildiv(length_cm[2], blockdim.z)); hipLaunchKernelGGL(initrdata3, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], length_cm[1], length_cm[2], stride_cm[0], stride_cm[1], stride_cm[2]); break; } default: std::cout << "invalid dimension!\n"; exit(1); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("init_real_data kernel launch failure: " + std::string(hipGetErrorName(err))); } // Imposes Hermitian symmetry for the input device buffer. // Note: input parameters are in column-major ordering. void impose_hermitian_symmetry_cm(const std::vector& length, const std::vector& ilength, const std::vector& stride, void* gpu_in) { size_t batch = 1; size_t dist = 1; size_t blockSize = DATA_GEN_THREADS; auto inputDim = length.size(); // Launch impose_hermitian_symmetry kernels. // NOTE: input parameters must be in row-major // ordering for these kernels. 
switch(inputDim) { case 1: { const auto gridDim = dim3(DivRoundingUp(batch, blockSize)); const auto blockDim = dim3(blockSize); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel, gridDim, blockDim, 0, 0, (hipDoubleComplex*)gpu_in, length[0], stride[0], dist, batch, length[0] % 2 == 0); break; } case 2: { const auto gridDim = dim3(DivRoundingUp(batch, blockSize), DivRoundingUp((length[1] + 1) / 2 - 1, blockSize)); const auto blockDim = dim3(blockSize, blockSize); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel, gridDim, blockDim, 0, 0, (hipDoubleComplex*)gpu_in, length[1], length[0], stride[1], stride[0], dist, batch, (ilength[1] + 1) / 2 - 1, length[1] % 2 == 0, length[0] % 2 == 0); break; } case 3: { const auto gridDim = dim3(DivRoundingUp(batch, blockSize), DivRoundingUp((length[2] + 1) / 2 - 1, blockSize), DivRoundingUp(length[1] - 1, blockSize)); const auto blockDim = dim3(blockSize, blockSize, blockSize); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel, gridDim, blockDim, 0, 0, (hipDoubleComplex*)gpu_in, length[2], length[1], length[0], stride[2], stride[1], stride[0], dist, batch, (ilength[2] + 1) / 2 - 1, ilength[1] - 1, (ilength[1] + 1) / 2 - 1, length[2] % 2 == 0, length[1] % 2 == 0, length[0] % 2 == 0); break; } default: throw std::runtime_error("Invalid dimension"); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("impose_hermitian_symmetry_interleaved kernel launch failure: " + std::string(hipGetErrorName(err))); } // Initialize the Hermitian complex input buffer where the data has lengths given in length, the // transform has lengths given in length and stride given in stride. The device buffer is assumed // to have been allocated. 
void init_hermitiancomplex_cm(const std::vector& length, const std::vector& ilength, const std::vector& stride, void* gpu_in) { size_t blockSize = 256; const dim3 blockdim(blockSize); switch(length.size()) { case 1: { const dim3 griddim(ceildiv(ilength[0], blockSize)); hipLaunchKernelGGL( initcdata1, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], stride[0]); break; } case 2: { const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y)); hipLaunchKernelGGL(initcdata2, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], stride[0], stride[1]); break; } case 3: { const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y), ceildiv(ilength[2], blockdim.z)); hipLaunchKernelGGL(initcdata3, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], ilength[2], stride[0], stride[1], stride[2]); break; } default: throw std::runtime_error("Invalid dimension"); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("init_complex_data kernel launch failure: " + std::string(hipGetErrorName(err))); impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in); } #endif /* EXAMPLEKERNELS_H */ rocFFT-rocm-6.4.3/clients/samples/rocfft/exampleutils.h000066400000000000000000000136441501537341300230740ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef EXAMPLEUTILS_H #define EXAMPLEUTILS_H std::ostream& operator<<(std::ostream& stream, hipDoubleComplex c) { stream << "(" << c.x << "," << c.y << ")"; return stream; } // Increment the index (column-major) for looping over arbitrary dimensional loops with // dimensions length. template bool increment_cm(std::vector& index, const std::vector& length) { for(unsigned int idim = 0; idim < length.size(); ++idim) { if(index[idim] < length[idim]) { if(++index[idim] == length[idim]) { index[idim] = 0; continue; } break; } } // End the loop when we get back to the start: return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; }); } // Output a formatted general-dimensional array with given length and stride in batches // separated by dist, in column-major order. 
// Prints every element of each batch followed by a space, walking the indices
// with increment_cm. After an element, one "\n" is written for each leading
// dimension whose index sits at its last position, stopping at the first
// dimension that is not at its end. Each batch is terminated with std::endl.
//
//   data   - host buffer to print.
//   length - extent of each dimension.
//   stride - element stride of each dimension.
//   nbatch - number of batches to print.
//   dist   - element offset between the starts of consecutive batches.
template
void printbuffer_cm(const std::vector& data,
                    const std::vector& length,
                    const std::vector& stride,
                    const size_t nbatch,
                    const size_t dist)
{
    for(size_t b = 0; b < nbatch; b++)
    {
        std::vector index(length.size());
        std::fill(index.begin(), index.end(), 0);
        do
        {
            // Linear offset of the current index: batch offset plus index . stride.
            const auto i
                = std::inner_product(index.begin(), index.end(), stride.begin(), b * dist);
            assert(i >= 0);
            assert(i < data.size());
            std::cout << data[i] << " ";
            // Break the line once for every leading dimension that just finished.
            for(size_t idx = 0; idx < index.size(); ++idx)
            {
                if(index[idx] == (length[idx] - 1))
                {
                    std::cout << "\n";
                }
                else
                {
                    break;
                }
            }
        } while(increment_cm(index, length));
        std::cout << std::endl;
    }
}

// Check that a multi-dimensional array of complex values with dimensions length
// and stride stride, with nbatch copies separated by dist is Hermitian-symmetric.
// Column-major version.
//
// For every index whose first component (and whose negated first component) is
// at most length_cm[0] / 2, the element is compared with the element at the
// negated index (taken modulo the length): the real parts must be equal and
// the imaginary parts must be exact negatives of each other.
// Returns false if any pair fails; when verbose is set, each failing pair is
// printed to std::cout.
template
bool check_symmetry_cm(const std::vector& data,
                       const std::vector& length_cm,
                       const std::vector& stride_cm,
                       const size_t nbatch,
                       const size_t dist,
                       const bool verbose = true)
{
    bool issymmetric = true;
    for(size_t b = 0; b < nbatch; b++)
    {
        std::vector index(length_cm.size());
        std::fill(index.begin(), index.end(), 0);
        do
        {
            bool skip = false;

            // negindex is the mirror image of index: (length - index) mod length.
            std::vector negindex(index.size());
            for(size_t idx = 0; idx < index.size(); ++idx)
            {
                // Positions beyond half of the first dimension are not checked.
                if(index[0] > length_cm[0] / 2)
                {
                    skip = true;
                    break;
                }
                negindex[idx] = (length_cm[idx] - index[idx]) % length_cm[idx];
            }
            // The mirrored position is skipped under the same first-dimension limit.
            if(negindex[0] > length_cm[0] / 2)
            {
                skip = true;
            }

            if(!skip)
            {
                const auto i
                    = std::inner_product(index.begin(), index.end(), stride_cm.begin(), b * dist);
                const auto j = std::inner_product(
                    negindex.begin(), negindex.end(), stride_cm.begin(), b * dist);
                // Exact comparison: data[i] must equal the complex conjugate of data[j].
                if((data[i].x != data[j].x) or (data[i].y != -data[j].y))
                {
                    if(verbose)
                    {
                        // Report "(index)->offset  (negindex)->offset:  value value".
                        std::cout << "(";
                        std::string separator;
                        for(auto val : index)
                        {
                            std::cout << separator << val;
                            separator = ",";
                        }
                        std::cout << ")->";
                        std::cout << i << "\t";
                        std::cout << "(";
                        separator = "";
                        for(auto val : negindex)
                        {
                            std::cout << separator << val;
                            separator = ",";
                        }
                        std::cout << ")->";
                        std::cout << j << ":\t";
                        std::cout << data[i] << " " << data[j];
                        std::cout << "\tnot conjugate!" << std::endl;
                    }
                    issymmetric = false;
                }
            }
        } while(increment_cm(index, length_cm));
    }
    return issymmetric;
}

#endif /* EXAMPLEUTILS_H */
rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_callback.cpp000066400000000000000000000156701501537341300255260ustar00rootroot00000000000000
/******************************************************************************
 * Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
*******************************************************************************/ #include "rocfft/rocfft.h" #include #include #include #include #include #include #include // example of using load/store callbacks with rocfft struct load_cbdata { double2* filter; double scale; }; __device__ double2 load_callback(double2* input, size_t offset, void* cbdata, void* sharedMem) { auto data = static_cast(cbdata); // multiply each element by filter element and scale return hipCmul(hipCmul(input[offset], data->filter[offset]), make_hipDoubleComplex(data->scale, data->scale)); } __device__ auto load_callback_dev = load_callback; int main() { const size_t N = 8; std::vector cx(N), filter(N); // initialize data and filter for(size_t i = 0; i < N; i++) { cx[i].x = i; cx[i].y = i; filter[i].x = rand() / static_cast(RAND_MAX); filter[i].y = 0; } // rocfft gpu compute // ================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. 
double2 *x, *filter_dev; // create buffers if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(hipMalloc(&filter_dev, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device hipError_t hip_status = hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); hip_status = hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(work_buf_size) { if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Prepare callback load_cbdata cbdata_host; cbdata_host.filter = filter_dev; cbdata_host.scale = 1.0 / static_cast(N); void* cbdata_dev; if(hipMalloc(&cbdata_dev, sizeof(load_cbdata)) != hipSuccess) throw std::runtime_error("hipMalloc failed."); hip_status = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Get a 
properly-typed host pointer to the device function, as // rocfft_execution_info_set_load_callback expects void*. void* cbptr_host = nullptr; hip_status = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*)); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpyFromSymbol failed."); // set callback if(rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_load_callback failed."); // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); hip_status = hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(cbdata_dev) != hipSuccess) throw std::runtime_error("hipFree failed."); if(hipFree(filter_dev) != hipSuccess) throw std::runtime_error("hipFree failed."); if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_complexcomplex.cpp000066400000000000000000000245411501537341300270260ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro 
Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include #include #include #include "../../../shared/CLI11.hpp" #include "examplekernels.h" #include "exampleutils.h" #include int main(int argc, char* argv[]) { std::cout << "rocfft double-precision complex-to-complex transform\n" << std::endl; // Length of transform: std::vector length = {8}; // Gpu device id: size_t deviceId = 0; // Command-line options: CLI::App app{"rocfft sample command line options"}; app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); CLI::Option* opt_outofplace = app.add_flag("-o, --outofplace", "Perform an out-of-place transform"); CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform"); app.add_option( "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)"); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } // Placeness for the transform if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; // Direction of transform const rocfft_transform_type direction = *opt_inverse ? 
rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse; // Set up the strides and buffer size for the input: std::vector istride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { istride.push_back(length[i - 1] * istride[i - 1]); } const size_t isize = length[length.size() - 1] * istride[istride.size() - 1]; // Set up the strides and buffer size for the output: std::vector ostride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { ostride.push_back(length[i - 1] * ostride[i - 1]); } const size_t osize = length[length.size() - 1] * ostride[ostride.size() - 1]; // Print information about the transform: std::cout << "direction: "; if(direction == rocfft_transform_type_complex_forward) std::cout << "forward\n"; else std::cout << "inverse\n"; std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; if(inplace) std::cout << "in-place transform\n"; else std::cout << "out-of-place transform\n"; std::cout << "deviceID: " << deviceId << "\n"; std::cout << "input strides:"; for(auto i : istride) std::cout << " " << i; std::cout << "\n"; std::cout << "output strides:"; for(auto i : ostride) std::cout << " " << i; std::cout << "\n"; std::cout << "input size: " << isize << "\n"; std::cout << "output size: " << isize << "\n"; std::cout << std::endl; // Set the device: if(hipSetDevice(deviceId) != hipSuccess) throw std::runtime_error("hipSetDevice failed."); // Create HIP device object and allocate data hipDoubleComplex* gpu_in = nullptr; if(hipMalloc(&gpu_in, isize * sizeof(hipDoubleComplex)) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Inititalize the data on the device initcomplex_cm(length, istride, gpu_in); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); hipError_t hip_status = hipGetLastError(); if(hip_status != hipSuccess) throw std::runtime_error("device error"); std::cout << "input:\n"; std::vector idata(isize); hip_status = 
hipMemcpy(idata.data(), gpu_in, isize * sizeof(hipDoubleComplex), hipMemcpyDefault); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(idata, length, istride, 1, isize); // Create the a descrition struct to set data layout: rocfft_plan_description gpu_description = nullptr; // rocfft_status can be used to capture API status info rocfft_status rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan description"); rc = rocfft_plan_description_set_data_layout(gpu_description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance ostride.size(), // output stride length ostride.data(), // output stride data 0); // ouptut batch distance if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. 
// Create the plan rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms gpu_description); // Description if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); size_t workbuffersize = 0; rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed."); rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to set work buffer."); } // If the transform is out-of-place, allocate the output buffer as well: double2* gpu_out = inplace ? 
gpu_in : nullptr; if(!inplace) { hip_status = hipMalloc(&gpu_out, osize * sizeof(hipDoubleComplex)); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed."); } // Execute the GPU transform: rc = rocfft_execute(gpu_plan, // plan (void**)&gpu_in, // in_buffer (void**)&gpu_out, // out_buffer planinfo); // execution info if(rc != rocfft_status_success) throw std::runtime_error("failed to execute."); // Get the output from the device and print to cout: std::cout << "output:\n"; std::vector odata(osize); hip_status = hipMemcpy(odata.data(), gpu_out, osize * sizeof(hipDoubleComplex), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(odata, length, istride, 1, isize); // Clean up: free GPU memory: if(hipFree(gpu_in) != hipSuccess) throw std::runtime_error("hipFree failed."); if(!inplace) { if(hipFree(gpu_out) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(wbuffer != nullptr) { if(hipFree(wbuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); gpu_description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_realcomplex.cpp000066400000000000000000000277311501537341300263060ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include #include #include #include "../../../shared/CLI11.hpp" #include "examplekernels.h" #include "exampleutils.h" #include int main(int argc, char* argv[]) { std::cout << "rocfft double-precision real/complex transform\n" << std::endl; // Length of transform: std::vector length = {8}; // Gpu device id: size_t deviceId = 0; // Command-line options: CLI::App app{"rocfft sample command line options"}; app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); CLI::Option* opt_outofplace = app.add_flag("-o, --outofplace", "Perform an out-of-place transform"); CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform"); app.add_option( "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)"); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } // Placeness for the transform if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; // Direction of transform const rocfft_transform_type direction = *opt_inverse ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward; const bool forward = direction == rocfft_transform_type_real_forward; // Set up the strides and buffer size for the real values: std::vector rstride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { // In-place transforms need space for two extra real values in the contiguous // direction. auto val = (length[i - 1] + ((inplace && i == 1) ? 
2 : 0)) * rstride[i - 1]; rstride.push_back(val); } // NB: not tight, but hey const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1]; std::vector rdata(real_size); // host storage // The complex data length is half + 1 of the real data length in the contiguous // dimensions. Since rocFFT is column-major, this is the first index. std::vector clength = length; clength[0] = clength[0] / 2 + 1; std::vector cstride = {1}; for(unsigned int i = 1; i < clength.size(); ++i) { cstride.push_back(clength[i - 1] * cstride[i - 1]); } const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1]; std::vector cdata(complex_size); // host storage // Based on the direction, we set the input and output parameters appropriately. const size_t isize = forward ? real_size : complex_size; const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex)); const std::vector ilength = forward ? length : clength; const std::vector istride = forward ? rstride : cstride; const size_t osize = forward ? complex_size : real_size; const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double)); const std::vector olength = forward ? clength : length; const std::vector ostride = forward ? 
cstride : rstride; // Print information about the transform: std::cout << "direction: "; if(forward) std::cout << "forward\n"; else std::cout << "inverse\n"; std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; if(inplace) std::cout << "in-place transform\n"; else std::cout << "out-of-place transform\n"; std::cout << "deviceID: " << deviceId << "\n"; std::cout << "input length:"; for(auto i : ilength) std::cout << " " << i; std::cout << "\n"; std::cout << "input buffer stride:"; for(auto i : istride) std::cout << " " << i; std::cout << "\n"; std::cout << "input buffer size: " << ibytes << "\n"; std::cout << "output length:"; for(auto i : olength) std::cout << " " << i; std::cout << "\n"; std::cout << "output buffer stride:"; for(auto i : ostride) std::cout << " " << i; std::cout << "\n"; std::cout << "output buffer size: " << obytes << "\n"; std::cout << std::endl; // Set the device: if(hipSetDevice(deviceId) != hipSuccess) throw std::runtime_error("hipSetDevice failed."); // Create HIP device object and initialize data // Kernels are provided in examplekernels.h void* gpu_in = nullptr; hipError_t hip_status = hipMalloc(&gpu_in, inplace ? 
std::max(ibytes, obytes) : ibytes); if(hip_status != hipSuccess) throw std::runtime_error("device error"); if(forward) { initreal_cm(length, istride, gpu_in); } else { init_hermitiancomplex_cm(length, ilength, istride, gpu_in); } // Print the input: std::cout << "input:\n"; if(forward) { hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, ilength, istride, 1, isize); } else { hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, ilength, istride, 1, isize); // Check that the buffer is Hermitian symmetric: check_symmetry_cm(cdata, length, istride, 1, isize); } // rocfft_status can be used to capture API status info rocfft_status rc = rocfft_status_success; // Create the a descrition struct to set data layout: rocfft_plan_description gpu_description = nullptr; rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan description"); rc = rocfft_plan_description_set_data_layout( gpu_description, // input data format: forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved, // output data format: forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real, nullptr, nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance ostride.size(), // output stride length ostride.data(), // output stride data 0); // ouptut batch distance if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. 
// Create the FFT plan: rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms gpu_description); // Description if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); size_t workbuffersize = 0; rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed"); rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to set work buffer"); } // If the transform is out-of-place, allocate the output buffer as well: void* gpu_out = inplace ? 
gpu_in : nullptr; if(!inplace) { hip_status = hipMalloc(&gpu_out, obytes); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed"); } // Execute the GPU transform: rc = rocfft_execute(gpu_plan, // plan (void**)&gpu_in, // in_buffer (void**)&gpu_out, // out_buffer planinfo); // execution info if(rc != rocfft_status_success) throw std::runtime_error("failed to execute"); // Get the output from the device and print to cout: std::cout << "output:\n"; if(forward) { hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, olength, ostride, 1, osize); } else { hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, olength, ostride, 1, osize); } // Clean up: free GPU memory: if(hipFree(gpu_in) != hipSuccess) throw std::runtime_error("hipFree failed."); if(!inplace) { if(hipFree(gpu_out) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(wbuffer != nullptr) { if(hipFree(wbuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); gpu_description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; rocfft_cleanup(); return 0; } rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_set_stream.cpp000066400000000000000000000126471501537341300261410ustar00rootroot00000000000000// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "rocfft/rocfft.h" #include #include #include #include #include struct fft_fixture_t { std::vector cpu_buf; double2* gpu_buf = nullptr; hipStream_t stream = nullptr; rocfft_execution_info info = nullptr; rocfft_plan plan = nullptr; }; int main(int argc, char* argv[]) { std::cout << "rocfft example of 2 inplace transforms with 2 streams.\n" << std::endl; size_t length = 8; size_t total_bytes = length * sizeof(double2); hipError_t hip_status; rocfft_status fft_status; fft_fixture_t ffts[2]; /// preparation if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); for(auto& it : ffts) { // create cpu buffer it.cpu_buf.resize(length); // init cpu buffer... 
// create gpu buffer if(hipMalloc(&(it.gpu_buf), total_bytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // copy host to device if(hipMemcpy(it.gpu_buf, it.cpu_buf.data(), total_bytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // create stream if(hipStreamCreate(&(it.stream)) != hipSuccess) throw std::runtime_error("hipStreamCreate failed."); // create execution info fft_status = rocfft_execution_info_create(&(it.info)); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); // set stream // NOTE: The stream must be of type hipStream_t. // It is an error to pass the address of a hipStream_t object. fft_status = rocfft_execution_info_set_stream(it.info, it.stream); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_stream failed."); // create plan fft_status = rocfft_plan_create(&it.plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); size_t work_buf_size = 0; fft_status = rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); assert(work_buf_size == 0); // simple 1D inplace fft doesn't need extra working buffer } /// execution for(auto& it : ffts) { fft_status = rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), nullptr); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); } /// wait and copy back for(auto& it : ffts) { if(hipStreamSynchronize(it.stream) != hipSuccess) throw std::runtime_error("hipStreamSynchronize failed."); hip_status = hipMemcpy(it.cpu_buf.data(), it.gpu_buf, total_bytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy 
failed."); } /// clean up for(auto& it : ffts) { fft_status = rocfft_plan_destroy(it.plan); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); fft_status = rocfft_execution_info_destroy(it.info); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); if(hipStreamDestroy(it.stream) != hipSuccess) throw std::runtime_error("hipStreamDestroy failed."); if(hipFree(it.gpu_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-6.4.3/clients/tests/000077500000000000000000000000001501537341300164125ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/tests/CMakeLists.txt000066400000000000000000000305321501537341300211550ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-tests LANGUAGES CXX )

set(CMAKE_CXX_STANDARD 17)

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()

if( NOT ROCM_FOUND )
  find_package( ROCM 0.7.3 REQUIRED )
endif()

if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

include( ROCMInstallTargets )

# Sources of the rocfft-test executable
set( rocfft-test_source
  gtest_main.cpp
  rocfft_accuracy_test.cpp
  bitwise_repro/bitwise_repro_test.cpp
  accuracy_test.cpp
  accuracy_test_1D.cpp
  accuracy_test_2D.cpp
  accuracy_test_3D.cpp
  accuracy_test_adhoc.cpp
  accuracy_test_emulation.cpp
  accuracy_test_callback.cpp
  accuracy_test_checkstride.cpp
  multithread_test.cpp
  multi_device_test.cpp
  hermitian_test.cpp
  hipGraph_test.cpp
  callback_change_type.cpp
  default_callbacks_test.cpp
  unit_test.cpp
  buffer_hash_test.cpp
  validate_length_stride.cpp
  random.cpp
  ../../shared/array_validator.cpp
  )

add_executable( rocfft-test ${rocfft-test_source} ${rocfft-test_includes} )
add_executable( rtc_helper_crash rtc_helper_crash.cpp )

find_package( Boost REQUIRED )
set( Boost_DEBUG ON )
set( Boost_DETAILED_FAILURE_MSG ON )

option( BUILD_FFTW "Download and build FFTW" OFF )

# look for installed FFTW if we weren't asked to build it
if( NOT BUILD_FFTW )
  find_package( FFTW 3.0 MODULE COMPONENTS FLOAT DOUBLE )
endif()

include( ExternalProject )

if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
  # use extract timestamp for fetched files instead of timestamps in the archive
  cmake_policy(SET CMP0135 NEW)
endif()

# also try to build FFTW if FFTW isn't present
if( BUILD_FFTW OR NOT FFTW_FOUND )
  set(FFTW_LIBRARIES_DOUBLE
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3_threads${CMAKE_SHARED_LIBRARY_SUFFIX}
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(FFTW_LIBRARIES_SINGLE
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f_threads${CMAKE_SHARED_LIBRARY_SUFFIX}
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f${CMAKE_SHARED_LIBRARY_SUFFIX})

  set(FFTW_CMAKE_ARGS_COMMON
      -DDISABLE_FORTRAN=ON
      -DENABLE_AVX2=ON
      -DENABLE_THREADS=ON
      -DBUILD_SHARED_LIBS=ON
      -DBUILD_TESTS=OFF
      -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER})

  set(FFTW_SRC_URL http://www.fftw.org/fftw-3.3.9.tar.gz CACHE STRING "Location of FFTW source code")
  set(FFTW_SRC_SHA256 bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d CACHE STRING "SHA256 hash of FFTW source code")

  # build double-precision FFTW
  ExternalProject_Add(fftw_double
                      URL ${FFTW_SRC_URL}
                      URL_HASH SHA256=${FFTW_SRC_SHA256}
                      SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/fftw
                      PREFIX ${CMAKE_CURRENT_BINARY_DIR}
                      CMAKE_ARGS ${FFTW_CMAKE_ARGS_COMMON}
                      INSTALL_COMMAND ""
                      BUILD_BYPRODUCTS ${FFTW_LIBRARIES_DOUBLE})
  ExternalProject_Get_Property( fftw_double source_dir binary_dir )

  # also build single-precision fftw from the same source dir
  ExternalProject_Add(fftw_single
                      DOWNLOAD_COMMAND ""
                      SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/fftw
                      PREFIX ${CMAKE_CURRENT_BINARY_DIR}
                      CMAKE_ARGS ${FFTW_CMAKE_ARGS_COMMON} -DENABLE_FLOAT=ON
                      INSTALL_COMMAND ""
                      BUILD_BYPRODUCTS ${FFTW_LIBRARIES_SINGLE}
                      DEPENDS fftw_double)
  ExternalProject_Get_Property( fftw_single source_dir binary_dir )

  set(FFTW_INCLUDES
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw/api)
  set(FFTW_LIBRARIES
    ${FFTW_LIBRARIES_DOUBLE}
    ${FFTW_LIBRARIES_SINGLE})

  # FFTW we build is always threaded
  set( FFTW_MULTITHREAD TRUE )

  add_dependencies( rocfft-test fftw_double fftw_single )
  rocm_install(
    FILES ${FFTW_LIBRARIES}
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw
    COMPONENT clients-common
  )
else()
  include_directories(${FFTW_INCLUDE_DIRS})
endif()

# NOTE(review): the three bare `$` entries below are what is left of generator
# expressions whose contents were lost in this copy of the file; restore them
# from the upstream CMakeLists.txt.
set( rocfft-test_include_dirs
  $
  $
  $
  ${ROCM_CLANG_ROOT}/include
  )

set( rocfft-test_link_libs
  ${FFTW_LIBRARIES}
  )

include( ../cmake/build-gtest.cmake )

if( BUILD_GTEST OR NOT GTEST_FOUND )
  add_dependencies( rocfft-test gtest )
  list( APPEND rocfft-test_include_dirs ${GTEST_INCLUDE_DIRS} )
  list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} )
else()
  # NOTE(review): bare `$` is a truncated generator expression - restore from upstream.
  list( APPEND rocfft-test_include_dirs $ )
  list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} )
endif()

target_compile_options( rocfft-test PRIVATE ${WARNING_FLAGS} -Wno-cpp )

target_include_directories( rocfft-test
  PRIVATE
  ${rocfft-test_include_dirs}
  )

if( NOT BUILD_SHARED_LIBS )
  list(APPEND rocfft-test_link_libs ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS})
endif()

if( NOT ROCFFT_BUILD_SCOPE )
  find_package(SQLite3 REQUIRED)
  set( ROCFFT_SQLITE_LIB SQLite::SQLite3)
endif()

target_link_libraries( rocfft-test
  PRIVATE
  hip::device
  roc::rocfft
  ${ROCFFT_SQLITE_LIB}
  ${rocfft-test_link_libs}
  )

if ( USE_HIPRAND )
  target_link_libraries( rocfft-test
    PRIVATE
    hip::hiprand
  )
endif()

if( ROCFFT_MPI_ENABLE )
  target_link_libraries( rocfft-test
    PRIVATE
    MPI::MPI_CXX
  )
  add_compile_definitions( ROCFFT_MPI_ENABLE )

  if ( ROCFFT_CRAY_MPI_ENABLE )
    target_link_libraries( rocfft-test
      PRIVATE
      "mpi_gtl_hsa"
    )
    get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
    target_link_directories( rocfft-test
      PRIVATE
      ${MPI_LIBDIR}/../../../../gtl/lib
    )
  endif()
endif()

include( ../../cmake/std-filesystem.cmake )
target_link_std_experimental_filesystem( rocfft-test )

if( USE_CUDA )
  # NOTE(review): the two bare `$` entries are truncated generator expressions -
  # restore from upstream.
  target_include_directories( rocfft-test
    PRIVATE
    $
    $
    )
  target_compile_definitions( rocfft-test PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )

target_link_libraries( rocfft-test PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )

include( ../../cmake/sqlite.cmake )
target_link_libraries( rocfft-test PUBLIC ${ROCFFT_SQLITE_LIB} )
target_include_directories( rocfft-test PRIVATE ${sqlite_local_SOURCE_DIR} )
set_property( TARGET rocfft-test APPEND PROPERTY LINK_LIBRARIES ${ROCFFT_SQLITE_LIB} )

option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON )

if( BUILD_CLIENTS_TESTS_OPENMP )
  if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
    target_compile_options( rocfft-test PRIVATE -fopenmp )
    target_link_libraries( rocfft-test PRIVATE -fopenmp -L${HIP_CLANG_ROOT}/lib -Wl,-rpath=${HIP_CLANG_ROOT}/lib )
    target_include_directories( rocfft-test PRIVATE ${HIP_CLANG_ROOT}/include )
  else()
    if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
      target_compile_options( rocfft-test PRIVATE -fopenmp=libomp )
      target_link_options( rocfft-test PRIVATE -fopenmp=libomp )
    endif()
  endif()
endif()

if(FFTW_MULTITHREAD)
  # FFTW_MULTITHREAD is a preprocessor definition, so register it as one
  # rather than passing a raw -D compile option.
  target_compile_definitions( rocfft-test PRIVATE FFTW_MULTITHREAD )
endif( )

set_target_properties( rocfft-test PROPERTIES
  CXX_STANDARD_REQUIRED ON
)

# Pick the directory the test executables are written to, relative to
# PROJECT_BINARY_DIR, depending on which build scope variable is set.
if( ROCFFT_BUILD_SCOPE )
  set( TESTS_OUT_DIR "/../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
  set( TESTS_OUT_DIR "/../bin" )
else()
  set( TESTS_OUT_DIR "/bin" )
endif()
string( CONCAT TESTS_OUT_DIR "${PROJECT_BINARY_DIR}" ${TESTS_OUT_DIR} )

set_target_properties(rocfft-test
                      PROPERTIES
                      RUNTIME_OUTPUT_DIRECTORY
                      ${TESTS_OUT_DIR})
set_target_properties(rtc_helper_crash
                      PROPERTIES
                      RUNTIME_OUTPUT_DIRECTORY
                      ${TESTS_OUT_DIR})

rocm_install(TARGETS rocfft-test rtc_helper_crash COMPONENT tests)

if (WIN32)

  # Ensure tests run with HIP DLLs and not anything the driver owns
  # in system32. Libraries like amdhip64.dll are also in the HIP
  # runtime, and we need run with those. But the only way to make a
  # same-named DLL override something in system32 is to have it next
  # to the executable. So copy them in.
  file( GLOB third_party_dlls
    LIST_DIRECTORIES OFF
    CONFIGURE_DEPENDS
    ${HIP_DIR}/bin/*.dll
    C:/Windows/System32/libomp140*.dll
  )
  foreach( file_i ${third_party_dlls})
    # The destination is the directory of the built rocfft-test executable
    # (the copy has to land next to the executable, per the comment above).
    # VERBATIM keeps argument escaping platform-independent.
    add_custom_command( TARGET rocfft-test
      POST_BUILD
      COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $<TARGET_FILE_DIR:rocfft-test>
      VERBATIM )
  endforeach( file_i )
endif()

if( ROCFFT_MPI_ENABLE )
  # normal and dynamic-loading MPI worker processes
  foreach(worker rocfft_mpi_worker dyna_rocfft_mpi_worker)
    add_executable( ${worker} rocfft_mpi_worker.cpp )
    target_compile_options( ${worker} PRIVATE -fopenmp )
    # NOTE(review): the trailing bare `$` is a truncated generator expression -
    # restore from upstream.
    target_include_directories( ${worker}
      PRIVATE
      ${CMAKE_BINARY_DIR}/include
      ${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/
      ${MPI_C_INCLUDE_PATH}
      $
    )
    target_compile_options( ${worker} PRIVATE ${WARNING_FLAGS} )

    # All target_link_libraries calls on the worker targets use the PRIVATE
    # keyword; the keyword and plain signatures cannot be mixed on one target.
    if ( ROCFFT_CRAY_MPI_ENABLE )
      target_link_libraries( ${worker}
        PRIVATE
        -fopenmp
        hip::hiprand
        hip::device
        MPI::MPI_CXX
        ${FFTW_LIBRARIES}
        "mpi_gtl_hsa"
      )
      get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
      target_link_directories( ${worker}
        PRIVATE
        ${MPI_LIBDIR}/../../../../gtl/lib
      )
    else()
      target_link_libraries( ${worker}
        PRIVATE
        -fopenmp
        hip::hiprand
        hip::device
        MPI::MPI_CXX
        ${FFTW_LIBRARIES}
      )
    endif()

    set_target_properties(${worker}
                          PROPERTIES
                          RUNTIME_OUTPUT_DIRECTORY
                          ${TESTS_OUT_DIR})
    rocm_install(TARGETS ${worker} COMPONENT tests)
  endforeach()

  # link normal MPI worker against rocFFT
  target_link_libraries( rocfft_mpi_worker
    PRIVATE
    roc::rocfft
  )

  # dyna worker only needs to dynamically load libraries
  target_compile_definitions( dyna_rocfft_mpi_worker PRIVATE ROCFFT_DYNA_MPI_WORKER )
  target_link_libraries( dyna_rocfft_mpi_worker
    PRIVATE
    ${CMAKE_DL_LIBS}
  )
endif()
rocFFT-rocm-6.4.3/clients/tests/accuracy_test.cpp000066400000000000000000000624741501537341300217640ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h"
#include "../../shared/rocfft_complex.h"
// NOTE(review): the header name of the next #include, and the template
// parameter lists / template arguments throughout this file, appear to be
// missing from this copy - restore them from the upstream file.
#include

// Load callback. cbdata points to a structure holding the expected buffer
// base address ('base') and the scalar to apply ('scalar').
// Returns input[offset] multiplied by the scalar when 'input' is the
// expected base address.
template
__host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // multiply each element by scalar
    if(input == testdata->base)
        return input[offset] * testdata->scalar;
    else
    {
        // wrong base address passed, return something obviously wrong
        return input[0];
    }
}

// Device-side function pointers to the load callback, one per element type.
// get_load_callback_host() copies these to the host with hipMemcpyFromSymbol.
__device__ auto load_callback_dev_half = load_callback;
__device__ auto load_callback_dev_complex_half = load_callback>;

__device__ auto load_callback_dev_float = load_callback;
__device__ auto load_callback_dev_complex_float = load_callback>;

__device__ auto load_callback_dev_double = load_callback;
__device__ auto load_callback_dev_complex_double = load_callback>;

// Load callback for the inverse direction of a round trip. cbdata points to
// the same base/scalar structure as for load_callback.
// Returns input[offset] minus the scalar, which undoes the addition done by
// store_callback.
template
__host__ __device__ Tdata load_callback_round_trip_inverse(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // subtract each element by scalar
    if(input == testdata->base)
        return input[offset] - testdata->scalar;
    else
    {
        // wrong base address passed, return something obviously wrong
        return input[0];
    }
}

// Device-side function pointers to the round-trip-inverse load callback.
__device__ auto load_callback_round_trip_inverse_dev_half = load_callback_round_trip_inverse;
__device__ auto load_callback_round_trip_inverse_dev_complex_half = load_callback_round_trip_inverse>;

__device__ auto load_callback_round_trip_inverse_dev_float = load_callback_round_trip_inverse;
__device__ auto load_callback_round_trip_inverse_dev_complex_float = load_callback_round_trip_inverse>;

__device__ auto load_callback_round_trip_inverse_dev_double = load_callback_round_trip_inverse;
__device__ auto load_callback_round_trip_inverse_dev_complex_double = load_callback_round_trip_inverse>;

// Copies the device function pointer of the load callback matching the given
// array type and precision to the host and returns it.
//   itype              - input array type; interleaved complex/hermitian and
//                        real are handled
//   precision          - half, single or double
//   round_trip_inverse - select the round-trip-inverse variant of the callback
// Returns nullptr for any other array type. A failing hipMemcpyFromSymbol is
// reported through EXPECT_EQ.
void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse = false)
{
    void* load_callback_host = nullptr;
    switch(itype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        // every precision case below returns
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(
                              &load_callback_host,
                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_half),
                              sizeof(void*)),
                          hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                              HIP_SYMBOL(load_callback_dev_complex_half),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return load_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(
                              &load_callback_host,
                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_float),
                              sizeof(void*)),
                          hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                              HIP_SYMBOL(load_callback_dev_complex_float),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return load_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(
                              &load_callback_host,
                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_double),
                              sizeof(void*)),
                          hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                              HIP_SYMBOL(load_callback_dev_complex_double),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return load_callback_host;
        }
    }
    case fft_array_type_real:
    {
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                              HIP_SYMBOL(load_callback_round_trip_inverse_dev_half),
                                              sizeof(void*)),
                          hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                              HIP_SYMBOL(load_callback_dev_half),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return load_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ(
                    hipMemcpyFromSymbol(&load_callback_host,
                                        HIP_SYMBOL(load_callback_round_trip_inverse_dev_float),
                                        sizeof(void*)),
                    hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                              HIP_SYMBOL(load_callback_dev_float),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return load_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ(
                    hipMemcpyFromSymbol(&load_callback_host,
                                        HIP_SYMBOL(load_callback_round_trip_inverse_dev_double),
                                        sizeof(void*)),
                    hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                              HIP_SYMBOL(load_callback_dev_double),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return load_callback_host;
        }
    }
    default:
        // planar is unsupported for now
        return load_callback_host;
    }
}

// Store callback. cbdata points to the same base/scalar structure used by
// the load callbacks. Writes element plus the scalar to output[offset] when
// 'output' is the expected base address; otherwise writes nothing.
template
__host__ __device__ static void store_callback(Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // add scalar to each element
    if(output == testdata->base)
    {
        output[offset] = element + testdata->scalar;
    }
    // otherwise, wrong base address passed, just don't write
}

// Device-side function pointers to the store callback, one per element type.
__device__ auto store_callback_dev_half = store_callback;
__device__ auto store_callback_dev_complex_half = store_callback>;

__device__ auto store_callback_dev_float = store_callback;
__device__ auto store_callback_dev_complex_float = store_callback>;

__device__ auto store_callback_dev_double = store_callback;
__device__ auto store_callback_dev_complex_double = store_callback>;

// Store callback for the inverse direction of a round trip. Writes element
// divided by the scalar, which undoes the multiplication done by
// load_callback; writes nothing when 'output' is not the expected base
// address.
template
__host__ __device__ static void store_callback_round_trip_inverse(
    Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // divide each element by scalar
    if(output == testdata->base)
    {
        output[offset] = element / testdata->scalar;
    }
    // otherwise, wrong base address passed, just don't write
}

// Device-side function pointers to the round-trip-inverse store callback.
__device__ auto store_callback_round_trip_inverse_dev_half = store_callback_round_trip_inverse;
__device__ auto store_callback_round_trip_inverse_dev_complex_half = store_callback_round_trip_inverse>;

__device__ auto store_callback_round_trip_inverse_dev_float = store_callback_round_trip_inverse;
__device__ auto store_callback_round_trip_inverse_dev_complex_float = store_callback_round_trip_inverse>;

__device__ auto store_callback_round_trip_inverse_dev_double = store_callback_round_trip_inverse;
__device__ auto store_callback_round_trip_inverse_dev_complex_double = store_callback_round_trip_inverse>;

// Copies the device function pointer of the store callback matching the
// given array type and precision to the host and returns it.
//   otype              - output array type; interleaved complex/hermitian and
//                        real are handled
//   precision          - half, single or double
//   round_trip_inverse - select the round-trip-inverse variant of the callback
// Returns nullptr for any other array type. A failing hipMemcpyFromSymbol is
// reported through EXPECT_EQ.
void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse = false)
{
    void* store_callback_host = nullptr;
    switch(otype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        // every precision case below returns
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(
                              &store_callback_host,
                              HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_half),
                              sizeof(void*)),
                          hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                              HIP_SYMBOL(store_callback_dev_complex_half),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return store_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(
                              &store_callback_host,
                              HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_float),
                              sizeof(void*)),
                          hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                              HIP_SYMBOL(store_callback_dev_complex_float),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return store_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(
                              &store_callback_host,
                              HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_double),
                              sizeof(void*)),
                          hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                              HIP_SYMBOL(store_callback_dev_complex_double),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return store_callback_host;
        }
    }
    case fft_array_type_real:
    {
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ(
                    hipMemcpyFromSymbol(&store_callback_host,
                                        HIP_SYMBOL(store_callback_round_trip_inverse_dev_half),
                                        sizeof(void*)),
                    hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                              HIP_SYMBOL(store_callback_dev_half),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return store_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ(
                    hipMemcpyFromSymbol(&store_callback_host,
                                        HIP_SYMBOL(store_callback_round_trip_inverse_dev_float),
                                        sizeof(void*)),
                    hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                              HIP_SYMBOL(store_callback_dev_float),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return store_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ(
                    hipMemcpyFromSymbol(&store_callback_host,
                                        HIP_SYMBOL(store_callback_round_trip_inverse_dev_double),
                                        sizeof(void*)),
                    hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                              HIP_SYMBOL(store_callback_dev_double),
                                              sizeof(void*)),
                          hipSuccess);
            }
            return store_callback_host;
        }
    }
    default:
        // planar is unsupported for now
        return store_callback_host;
    }
}

// Apply store callback if necessary
//
// Applies, on the host, the scaling and store callback to 'output':
// each element is multiplied by params.scale_factor when that is not 1.0,
// and store_callback is run on it when params.run_callbacks is set.
// Does nothing when neither applies. For planar types only scaling is
// applied, to every buffer in 'output'; for the other handled types only
// output.front() is processed. Aborts on any other array type.
void apply_store_callback(const fft_params& params, std::vector& output)
{
    if(!params.run_callbacks && params.scale_factor == 1.0)
        return;

    callback_test_data cbdata;
    cbdata.scalar = params.store_cb_scalar;
    cbdata.base   = output.front().data();

    switch(params.otype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            // element count = buffer size / size of one element
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast*>(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast*>(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast*>(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    case fft_array_type_complex_planar:
    case fft_array_type_hermitian_planar:
    {
        // planar wouldn't run callbacks, but we could still want scaling
        switch(params.precision)
        {
        case fft_precision_half:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            for(auto& buf : output)
            {
                const size_t num_elems = buf.size() / elem_size;

                auto output_begin = reinterpret_cast*>(buf.data());
                for(size_t i = 0; i < num_elems; ++i)
                {
                    auto& element = output_begin[i];
                    if(params.scale_factor != 1.0)
                        element = element * params.scale_factor;
                }
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            for(auto& buf : output)
            {
                const size_t num_elems = buf.size() / elem_size;

                auto output_begin = reinterpret_cast*>(buf.data());
                for(size_t i = 0; i < num_elems; ++i)
                {
                    auto& element = output_begin[i];
                    if(params.scale_factor != 1.0)
                        element = element * params.scale_factor;
                }
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            for(auto& buf : output)
            {
                const size_t num_elems = buf.size() / elem_size;

                auto output_begin = reinterpret_cast*>(buf.data());
                for(size_t i = 0; i < num_elems; ++i)
                {
                    auto& element = output_begin[i];
                    if(params.scale_factor != 1.0)
                        element = element * params.scale_factor;
                }
            }
            break;
        }
        }
    }
    break;
    case fft_array_type_real:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            const size_t elem_size = sizeof(rocfft_fp16);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(float);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(double);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    default:
        // this is FFTW data which should always be interleaved (if complex)
        abort();
    }
}

// apply load callback if necessary
//
// Runs load_callback, on the host, over every element of input.front() and
// stores the result back in place. Does nothing unless params.run_callbacks
// is set. Aborts on array types other than interleaved complex/hermitian
// and real.
void apply_load_callback(const fft_params& params, std::vector& input)
{
    if(!params.run_callbacks)
        return;
    // we're applying callbacks to FFTW input/output which we can
    // assume is contiguous and non-planar

    callback_test_data cbdata;
    cbdata.scalar = params.load_cb_scalar;
    cbdata.base   = input.front().data();

    switch(params.itype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            // element count = buffer size / size of one element
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast*>(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast*>(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast*>(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    case fft_array_type_real:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            const size_t elem_size = sizeof(rocfft_fp16);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(float);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(double);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    default:
        // this is FFTW data which should always be interleaved (if complex)
        abort();
    }
}
rocFFT-rocm-6.4.3/clients/tests/accuracy_test_1D.cpp000066400000000000000000000610701501537341300222770ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" #include "accuracy_tests_range.h" using ::testing::ValuesIn; INSTANTIATE_TEST_SUITE_P(pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_1D}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_1D}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), 
accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, 
place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // small 1D sizes just need to make sure our factorization isn't // completely broken, so we just check simple C2C outplace interleaved INSTANTIATE_TEST_SUITE_P( small_1D, accuracy_test, ::testing::ValuesIn(param_generator_base( test_prob, {fft_transform_type_complex_forward, fft_transform_type_real_forward}, generate_lengths({small_1D_sizes()}), {fft_precision_single}, {1}, [](fft_transform_type t, const std::vector& place_range, const bool planar) { if(t == fft_transform_type_complex_forward) return std::vector{ std::make_tuple(t, place_range[0], fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)}; else return std::vector{std::make_tuple( t, place_range[0], fft_array_type_real, fft_array_type_hermitian_interleaved)}; }, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_inplace}, true)), accuracy_test::TestName); // NB: // We have known non-unit strides issues for 1D: // - C2C middle size(for instance, single precision, 8192) // - C2C large size(for instance, single precision, 524288) // We need to fix non-unit strides first, and then address non-unit strides + batch tests. // Then check these problems of R2C and C2R. After that, we could open arbitrary permutations in the // main tests. // // The below test covers non-unit strides, pow of 2, middle sizes, which has SBCC/SBRC kernels // invloved. 
INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex, accuracy_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride_1D}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex_half, accuracy_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride_half_1D}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_1D}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real_half, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_half_1D}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); // Create an array parameters for strided 2D batched transforms. 
inline auto param_generator_complex_1d_batched_2d(const double base_prob, const std::vector>& v_lengths, const std::vector& precision_range, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range) { std::vector params; for(auto& transform_type : trans_type_range_complex) { for(const auto& lengths : v_lengths) { // try to ensure that we are given literal lengths, not // something to be passed to generate_lengths if(lengths.empty() || lengths.size() > 3) { assert(false); continue; } for(const auto precision : precision_range) { for(const auto& types : generate_types(transform_type, place_range, true)) { for(const auto& ioffset : ioffset_range) { for(const auto& ooffset : ooffset_range) { fft_params param; param.length = lengths; param.istride = lengths; param.ostride = lengths; param.nbatch = lengths[0]; param.precision = precision; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.idist = 1; param.odist = 1; param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.ioffset = ioffset; param.ooffset = ooffset; param.validate(); const double roll = hash_prob(random_seed, param.token()); const double run_prob = base_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? 
real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } if(param.valid(0)) { params.push_back(param); } } } } } } } return params; } INSTANTIATE_TEST_SUITE_P( pow2_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow2_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow3_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow3_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow5_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow5_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( prime_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({prime_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/accuracy_test_2D.cpp000066400000000000000000000332341501537341300223010ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" #include "accuracy_tests_range.h" using ::testing::ValuesIn; INSTANTIATE_TEST_SUITE_P(pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_2D, pow2_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_2D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_2D, {2, 4, 8, 16, 32}}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_2D, pow2_range_2D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_2D, pow3_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_2D, pow3_range_2D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_2D, pow5_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, 
ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_2D, pow5_range_2D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_2D, prime_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_2D, prime_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_2D, mix_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_2D, mix_range_2D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // test length-1 on one dimension against a variety of non-1 lengths INSTANTIATE_TEST_SUITE_P(len1_2D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); // length-1 on the other dimension 
INSTANTIATE_TEST_SUITE_P(len1_swap_2D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/accuracy_test_3D.cpp000066400000000000000000000322311501537341300222760ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" #include "accuracy_tests_range.h" using ::testing::ValuesIn; INSTANTIATE_TEST_SUITE_P(pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow2_range_3D, pow2_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_3D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_3D, pow2_range_half_3D, pow2_range_half_3D}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow2_range_3D, pow2_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow3_range_3D, pow3_range_3D, pow3_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow3_range_3D, pow3_range_3D, pow3_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow5_range_3D, pow5_range_3D, 
pow5_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow5_range_3D, pow5_range_3D, pow5_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({prime_range_3D, prime_range_3D, prime_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({prime_range_3D, prime_range_3D, prime_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow3_range_3D, prime_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow3_range_3D, prime_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(sbrc_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({sbrc_range_3D, sbrc_range_3D, sbrc_range_3D}), precision_range_sp_dp, sbrc_batch_range_3D, stride_range, stride_range, 
ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( inner_batch_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}), precision_range_sp_dp, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( inner_batch_3D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({inner_batch_3D_range_half, inner_batch_3D_range_half, inner_batch_3D_range_half}), {fft_precision_half}, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(partial_pass_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, partial_pass_adhoc_3D, precision_range_sp_dp, partial_pass_batch_range_3D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(partial_pass_3D_callback, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, partial_pass_adhoc_3D, precision_range_sp_dp, partial_pass_batch_range_3D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, true)), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/accuracy_test_adhoc.cpp000066400000000000000000000371361501537341300231170ustar00rootroot00000000000000// Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" std::vector> adhoc_sizes = { // sizes that exercise L1D_TRTRT subplan of 2D_RTRT or 3D_TRTRTR {1, 220}, {1, 330}, {81, 220, 36}, // L1D_CC subplan of 3D_TRTRTR {4, 4, 8192}, // SBRC 192 with special param {192, 192, 192}, {192, 84, 84}, // Failure with build_CS_3D_BLOCK_RC {680, 128, 128}, // Large 1D primes that fall above the block threshold (length 262144). // Bluestein requires two forwards and one inverse FFTs, and the plan // for these sizes breakdown these FFTs either as: // L1D_TRTRT (T + STOCKHAM + T + STOCKHAM + T) for lengthBlue <= 4096^2 // or // L1D_TRTRT (T + L1D_CC + STOCKHAM_BL_CC + STOCHMAM_BL_RC + T + STOCKHAM + T) // for lengthBlue > 4096^2. 
{196597}, {25165813}, // 2D single-kernel bluestein size combined with multi-kernel bluestein {19, 2053}, // TILE_UNALIGNED type of SBRC 3D ERC {98, 98, 98}, // 3D_BLOCK_CR {336, 336, 56}, }; const static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(adhoc, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, adhoc_sizes, precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_adhoc, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, adhoc_sizes, precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // Test that dist is ignored for batch-1 transforms. Normally, // in-place transforms require same dist, but for batch-1 dist isn't // used for anything and differing dist should be allowed. inline auto param_permissive_iodist() { std::vector> lengths = adhoc_sizes; lengths.push_back({4}); std::vector params; for(const auto precision : precision_range_sp_dp) { for(const auto trans_type : trans_type_range) { for(const auto& types : generate_types(trans_type, place_range, true)) { if(std::get<1>(types) != fft_placement_inplace) continue; for(const auto& len : lengths) { fft_params param; param.length = len; param.precision = precision; param.idist = 2; param.odist = 3; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.validate(); const double roll = hash_prob(random_seed, param.token()); const double run_prob = test_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? 
complex_interleaved_prob_factor : 1.0) * (param.is_real() ? real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } if(param.valid(0)) { params.push_back(param); } } } } } return params; } INSTANTIATE_TEST_SUITE_P(adhoc_dist, accuracy_test, ::testing::ValuesIn(param_permissive_iodist()), accuracy_test::TestName); inline auto param_adhoc_colmajor() { // generate basic FFTs of adhoc sizes auto params = param_generator(test_prob, adhoc_sizes, {fft_precision_single}, {2}, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace}, false); // remove any params that are: // - 1D (not enough dims to swap) // - real-complex 2D (we only get to play with higher dims, so // again not enough dims to swap) params.erase(std::remove_if(params.begin(), params.end(), [](const fft_params& param) { if(param.length.size() == 1) return true; if(param.length.size() == 2) { if(param.transform_type == fft_transform_type_real_forward || param.transform_type == fft_transform_type_real_inverse) return true; } return false; }), params.end()); // reverse length/stride order on remaining params to make them // col-major std::for_each(params.begin(), params.end(), [](fft_params& param) { size_t start_dim = 0; // for real-complex we can't touch the fastest dim if(param.transform_type == fft_transform_type_real_forward || param.transform_type == fft_transform_type_real_inverse) ++start_dim; std::reverse(param.length.rbegin() + start_dim, param.length.rend()); std::reverse(param.istride.rbegin() + start_dim, param.istride.rend()); std::reverse(param.ostride.rbegin() + start_dim, param.ostride.rend()); }); return params; } INSTANTIATE_TEST_SUITE_P(adhoc_colmajor, accuracy_test, ::testing::ValuesIn(param_adhoc_colmajor()), accuracy_test::TestName); inline auto param_adhoc_stride() { std::vector params; for(const auto precision : precision_range_full) { for(const 
auto& types : generate_types(fft_transform_type_complex_forward, {fft_placement_inplace, fft_placement_notinplace}, true)) { // 2D with non-contiguous strides and dist fft_params param; param.length = {2, 35}; param.precision = precision; param.idist = 200; param.odist = 200; param.transform_type = fft_transform_type_complex_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {90, 2}; param.ostride = {90, 2}; params.push_back(param); } // test C2R/R2C with non-contiguous higher strides and dist - we // want unit stride for length0 so we do the even-length optimization for(const auto& types : generate_types(fft_transform_type_real_forward, {fft_placement_notinplace}, true)) { fft_params param; param.length = {4, 4, 4}; param.precision = precision; param.idist = 0; param.odist = 0; param.transform_type = fft_transform_type_real_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {16, 4, 1}; param.ostride = {16, 4, 1}; param.validate(); { const double roll = hash_prob(random_seed, param.token()); const double run_prob = test_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? 
real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } else { if(param.valid(0)) { params.push_back(param); } } } param.length = {2, 2, 2}; param.precision = precision; param.idist = 0; param.odist = 0; param.transform_type = fft_transform_type_real_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {20, 6, 1}; param.ostride = {20, 6, 1}; param.validate(); { const double roll = hash_prob(random_seed, param.token()); const double run_prob = test_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } else { if(param.valid(0)) { params.push_back(param); } } } } } return params; } INSTANTIATE_TEST_SUITE_P(adhoc_stride, accuracy_test, ::testing::ValuesIn(param_adhoc_stride()), accuracy_test::TestName); const auto adhoc_tokens = { // clang-format off "complex_forward_len_4_4_4_single_op_batch_2_istride_16_4_1_CI_ostride_4_16_1_CI_idist_64_odist_64_ioffset_0_0_ooffset_0_0", "complex_forward_len_512_64_single_ip_batch_3_istride_192_3_CI_ostride_192_3_CI_idist_1_odist_1_ioffset_0_0_ooffset_0_0", "real_forward_len_1024_1024_1024_single_op_batch_1_istride_1048576_1024_1_R_ostride_525312_513_1_HI_idist_1073741824_odist_537919488_ioffset_0_0_ooffset_0_0", "complex_forward_len_6144_single_ip_batch_34_istride_35_CI_ostride_35_CI_idist_1_odist_1_ioffset_0_0_ooffset_0_0", "real_forward_len_8192_single_ip_batch_65537_istride_1_R_ostride_1_HI_idist_8194_odist_4097_ioffset_0_0_ooffset_0_0", "real_forward_len_520_single_op_batch_270400_istride_1_R_ostride_1_HI_idist_520_odist_261_ioffset_0_0_ooffset_0_0", 
"real_forward_len_630_single_op_batch_396900_istride_1_R_ostride_1_HI_idist_630_odist_316_ioffset_0_0_ooffset_0_0", "real_forward_len_660_single_op_batch_435600_istride_1_R_ostride_1_HI_idist_660_odist_331_ioffset_0_0_ooffset_0_0", "real_forward_len_700_single_op_batch_490000_istride_1_R_ostride_1_HI_idist_700_odist_351_ioffset_0_0_ooffset_0_0", "real_forward_len_728_single_op_batch_529984_istride_1_R_ostride_1_HI_idist_728_odist_365_ioffset_0_0_ooffset_0_0", "real_forward_len_968_single_op_batch_937024_istride_1_R_ostride_1_HI_idist_968_odist_485_ioffset_0_0_ooffset_0_0", "real_forward_len_1020_single_op_batch_1040400_istride_1_R_ostride_1_HI_idist_1020_odist_511_ioffset_0_0_ooffset_0_0", "real_forward_len_378_42_single_ip_batch_66000_istride_44_1_R_ostride_22_1_HI_idist_16632_odist_8316_ioffset_0_0_ooffset_0_0", "real_forward_len_527_25_single_ip_batch_67500_istride_26_1_R_ostride_13_1_HI_idist_13702_odist_6851_ioffset_0_0_ooffset_0_0", "real_forward_len_630_38_single_ip_batch_65540_istride_40_1_R_ostride_20_1_HI_idist_25200_odist_12600_ioffset_0_0_ooffset_0_0", // clang-format on }; INSTANTIATE_TEST_SUITE_P(adhoc_token, accuracy_test, ::testing::ValuesIn(param_generator_token(test_prob, adhoc_tokens)), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/accuracy_test_callback.cpp000066400000000000000000000141011501537341300235600ustar00rootroot00000000000000// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" std::vector> callback_sizes = { // some single kernel sizes {4}, {16}, {81}, {100}, // L1D_TRTRT sizes {220}, {330}, {1344}, // L1D_CC sizes {8192}, {10000}, // prime {23}, {29}, // 2D_SINGLE sizes, small and big {16, 8}, {32, 32}, {9, 81}, {27, 81}, {81, 27}, {256, 9}, {9, 256}, {125, 32}, {32, 125}, // 2D_RTRT {20, 40}, {81, 81}, // 2D_RC {128, 64}, {128, 256}, // more complicated children of 2D_RTRT (L1D_TRTRT, L1D_CC, prime) {4, 220}, {220, 4}, {4, 8192}, {8192, 4}, {4, 23}, {23, 4}, // 3D_TRTRTR, with complicated children {63, 5, 6}, {6, 5, 63}, {23, 5, 6}, {6, 5, 23}, {70, 5, 6}, {6, 5, 70}, {8192, 5, 6}, {6, 5, 8192}, // 3D_RTRT, with complicated children {23, 4, 4}, {4, 4, 23}, {70, 4, 4}, {4, 4, 70}, {8192, 4, 4}, {4, 4, 8192}, // 3D odd lengths {27, 27, 27}, // 3D_BLOCK_RC {64, 64, 64}, }; const static std::vector> stride_range = {{1}}; const static std::vector> ioffset_range_zero = {{0, 0}}; const static std::vector> ooffset_range_zero = {{0, 0}}; const static std::vector> ioffset_range = {{0, 0}, {1, 1}}; const static std::vector> ooffset_range = {{0, 0}, {1, 1}}; auto forward_transform_types = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; INSTANTIATE_TEST_SUITE_P(callback, accuracy_test, ::testing::ValuesIn(param_generator_base(test_prob, forward_transform_types, callback_sizes, precision_range_sp_dp, batch_range, generate_types, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_callback, accuracy_test, ::testing::ValuesIn(param_generator_base(test_prob, forward_transform_types, callback_sizes, precision_range_sp_dp, batch_range, generate_types, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, true)), accuracy_test::TestName); // one of the obvious use cases for callbacks is to implement result // scaling manually, so 
use the same sizes to test rocFFT's own // result scaling feature. inline auto param_generator_scaling(const std::vector>& v_lengths) { auto params = param_generator(test_prob, callback_sizes, precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true); for(auto& param : params) param.scale_factor = 7.23; return params; } INSTANTIATE_TEST_SUITE_P(scaling, accuracy_test, ::testing::ValuesIn(param_generator_scaling(callback_sizes)), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/accuracy_test_checkstride.cpp000066400000000000000000000132151501537341300243210ustar00rootroot00000000000000// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" extern bool fftw_compare; inline auto param_checkstride() { // checkstride requires us to copy data back to the host for // checking, which we only do when comparing against FFTW. if(!fftw_compare) return std::vector{}; // tuples of length,stride,nbatch,dist to test. strides are arranged so // there's space either between elements on the fastest dim, or // between dims, or both. std::vector, std::vector, size_t, size_t>> sizes = { // 1D single kernel non-unit stride {{64}, {2}, 2, 140}, // 1D single kernel unit stride but non-contiguous batch {{64}, {1}, 2, 80}, // 1D odd length (to test odd-length R2C/C2R) {{15}, {2}, 2, 40}, // 1D SBCC+SBRC {{8192}, {2}, 2, 17000}, // 1D TRTRT {{24000}, {2}, 2, 50000}, // 2D_RTRT {{20, 30}, {80, 2}, 2, 1700}, {{40, 30}, {80, 2}, 2, 3600}, // 2D_RTRT unit stride along fast dim {{20, 30}, {40, 1}, 2, 1000}, {{40, 30}, {40, 1}, 2, 2000}, // 2D_RC {{64, 64}, {130, 2}, 2, 8400}, // 3D_RC {{64, 64, 64}, {8400, 130, 2}, 2, 540000}, // 3D_RTRTRT {{2, 3, 4}, {40, 10, 2}, 2, 100}, // bigger 3D_RTRTRT {{30, 40, 50}, {3000, 60, 1}, 2, 100000}, }; std::vector params; for(const auto trans_type : trans_type_range) { for(const auto& s : sizes) { for(const auto precision : precision_range_sp_dp) { for(const auto& types : generate_types(trans_type, {fft_placement_notinplace}, true)) { for(bool callback : {true, false}) { // callbacks don't work for planar bool is_planar = std::get<2>(types) == fft_array_type_complex_planar || std::get<2>(types) == fft_array_type_hermitian_planar || std::get<3>(types) == fft_array_type_complex_planar || std::get<3>(types) == fft_array_type_hermitian_planar; if(callback && is_planar) continue; fft_params param; param.length = std::get<0>(s); param.istride = std::get<1>(s); param.ostride = std::get<1>(s); param.nbatch = std::get<2>(s); param.precision = precision; param.idist = std::get<3>(s); param.odist = std::get<3>(s); 
param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.run_callbacks = callback; param.check_output_strides = true; param.validate(); const double roll = hash_prob(random_seed, param.token()); const double run_prob = test_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } if(param.valid(0)) { params.push_back(param); } } } } } } return params; } INSTANTIATE_TEST_SUITE_P(checkstride, accuracy_test, ::testing::ValuesIn(param_checkstride()), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/accuracy_test_emulation.cpp000066400000000000000000000164421501537341300240330ustar00rootroot00000000000000 // Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" #include "accuracy_tests_range.h" const auto emulation_tokens = { // clang-format off "complex_forward_len_4_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_4_odist_4_ioffset_0_0_ooffset_0_0", "complex_forward_len_4_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_4_odist_4_ioffset_0_0_ooffset_0_0", "complex_forward_len_8_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_8_odist_8_ioffset_0_0_ooffset_0_0", "complex_forward_len_16_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_16_odist_16_ioffset_0_0_ooffset_0_0", "complex_forward_len_32_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_32_odist_32_ioffset_0_0_ooffset_0_0", "complex_forward_len_64_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_64_odist_64_ioffset_0_0_ooffset_0_0", "complex_forward_len_128_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_128_odist_128_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_27_odist_27_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_27_odist_27_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_27_double_ip_batch_1_istride_27_1_CI_ostride_27_1_CI_idist_729_odist_729_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_27_single_ip_batch_1_istride_27_1_CI_ostride_27_1_CI_idist_729_odist_729_ioffset_0_0_ooffset_0_0", "complex_forward_len_125_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_125_odist_125_ioffset_0_0_ooffset_0_0", "complex_forward_len_125_125_single_ip_batch_1_istride_125_1_CI_ostride_125_1_CI_idist_15625_odist_15625_ioffset_0_0_ooffset_0_0", 
"complex_forward_len_121_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_121_odist_121_ioffset_0_0_ooffset_0_0", "complex_forward_len_121_121_single_ip_batch_1_istride_121_1_CI_ostride_121_1_CI_idist_14641_odist_14641_ioffset_0_0_ooffset_0_0", "complex_forward_len_216_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_216_odist_216_ioffset_0_0_ooffset_0_0", "complex_forward_len_10000_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_10000_odist_10000_ioffset_0_0_ooffset_0_0", "complex_forward_len_128_50_128_single_ip_batch_1_istride_6400_128_1_CI_ostride_6400_128_1_CI_idist_819200_odist_819200_ioffset_0_0_ooffset_0_0", "real_forward_len_16_256_256_single_op_batch_2_istride_65536_256_1_R_ostride_33024_129_1_HI_idist", "real_forward_len_256_128_256_single_op_batch_1_istride_32768_256_1_R_ostride_16512_129_1_HI_idist" // clang-format on }; INSTANTIATE_TEST_SUITE_P(emulation_token, accuracy_test, ::testing::ValuesIn(param_generator_token(emulation_prob, emulation_tokens)), accuracy_test::TestName); const static std::vector emulation_range_1D = {2, 3, 5, 16, 17, 29, 32, 64, 75, 128, 200, 256, 288, 298}; const static std::vector emulation_range_2D = {2, 3, 5, 16, 29, 17, 64, 76, 96, 112, 128, 150, 315}; const static std::vector emulation_range_3D = {2, 3, 5, 16, 29, 17, 32, 64, 128, 256}; INSTANTIATE_TEST_SUITE_P(emulation_1D, accuracy_test, ::testing::ValuesIn(param_generator(emulation_prob, generate_lengths({emulation_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(emulation_2D, accuracy_test, ::testing::ValuesIn(param_generator(emulation_prob, generate_lengths({emulation_range_2D, emulation_range_2D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(emulation_3D, accuracy_test, 
::testing::ValuesIn(param_generator(emulation_prob, generate_lengths({emulation_range_3D, emulation_range_3D, emulation_range_3D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/accuracy_tests_range.h000066400000000000000000000213561501537341300227620ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ACCURACY_TESTS_RANGE_H #define ACCURACY_TESTS_RANGE_H #include const static std::vector> stride_range = {{1}}; const static std::vector> ioffset_range_zero = {{0, 0}}; const static std::vector> ooffset_range_zero = {{0, 0}}; const static std::vector> ioffset_range = {{0, 0}, {1, 1}}; const static std::vector> ooffset_range = {{0, 0}, {1, 1}}; //----------------------------------------------------------------------- //----------------------------------------------------------------------- // 1D test problems //----------------------------------------------------------------------- //----------------------------------------------------------------------- // TODO: handle special case where length=2 for real/complex transforms. const static std::vector pow2_range_1D = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824}; const static std::vector pow2_range_half_1D = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; const static std::vector pow3_range_1D = {3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, 1594323, 4782969, 14348907, 43046721, 129140163, 387420489}; const static std::vector pow5_range_1D = {5, 25, 125, 625, 3125, 15625, 78125, 390625, 1953125, 9765625, 48828125, 244140625}; // radix 7, 11, 13 sizes that are either pure powers or sizes people have wanted in the wild const static std::vector radX_range_1D = {7, 49, 84, 112, 11, 13, 52, 104, 208, 343, 2401, 16807}; const static std::vector mix_range_1D = {6, 10, 12, 15, 20, 30, 56, 120, 150, 225, 240, 300, 336, 486, 600, 900, 1250, 1500, 1875, 2160, 2187, 2250, 2500, 3000, 4000, 12000, 24000, 72000}; const static std::vector prime_range_1D = {17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}; static std::vector small_1D_sizes() { static const size_t SMALL_1D_MAX = 
8192; // generate a list of sizes from 2 and up, skipping any sizes that are already covered std::vector covered_sizes; std::copy(pow2_range_1D.begin(), pow2_range_1D.end(), std::back_inserter(covered_sizes)); std::copy(pow3_range_1D.begin(), pow3_range_1D.end(), std::back_inserter(covered_sizes)); std::copy(pow5_range_1D.begin(), pow5_range_1D.end(), std::back_inserter(covered_sizes)); std::copy(radX_range_1D.begin(), radX_range_1D.end(), std::back_inserter(covered_sizes)); std::copy(mix_range_1D.begin(), mix_range_1D.end(), std::back_inserter(covered_sizes)); std::copy(prime_range_1D.begin(), prime_range_1D.end(), std::back_inserter(covered_sizes)); std::sort(covered_sizes.begin(), covered_sizes.end()); std::vector output; for(size_t i = 2; i < SMALL_1D_MAX; ++i) { if(!std::binary_search(covered_sizes.begin(), covered_sizes.end(), i)) { output.push_back(i); } } return output; } const static std::vector batch_range_1D = {4, 2, 1}; const static std::vector> stride_range_for_prime_1D = {{1}, {2}, {3}, {64}, {65}}; //TODO: this will be merged back to stride_range const static std::vector pow2_range_for_stride_1D = {4096, 8192, 524288}; const static std::vector pow2_range_for_stride_half_1D = {4096, 8192}; const static std::vector> stride_range_for_pow2_1D = {{2}, {3}}; const static std::vector batch_range_for_stride_1D = {2, 1}; //----------------------------------------------------------------------- //----------------------------------------------------------------------- // 2D test problems //----------------------------------------------------------------------- //----------------------------------------------------------------------- const static std::vector pow2_range_2D = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}; // For the current configuration, half-precision has a fft size limit of 65536 const static std::vector pow2_range_half_2D = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; const static std::vector pow3_range_2D = {3, 9, 27, 81, 
243, 729, 2187, 6561}; const static std::vector pow5_range_2D = {5, 25, 125, 625, 3125, 15625}; const static std::vector prime_range_2D = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277}; const static std::vector mix_range_2D = {56, 120, 336, 2160, 5000, 6000, 8000}; //----------------------------------------------------------------------- //----------------------------------------------------------------------- // 3D test problems //----------------------------------------------------------------------- //----------------------------------------------------------------------- const static std::vector pow2_range_3D = {4, 8, 16, 32, 128, 256}; // For the current configuration, half-precision has a fft size limit of 65536 const static std::vector pow2_range_half_3D = {4, 8, 16, 32}; const static std::vector pow3_range_3D = {3, 9, 27, 81, 243}; const static std::vector pow5_range_3D = {5, 25, 125}; const static std::vector prime_range_3D = {7, 11, 13, 17, 19, 23, 29}; // SBCC+SBRC as a sub-node of a 3D TRTRTR const static std::vector> pow2_adhoc_3D = {{4, 4, 8192}}; // Test combinations of SBRC sizes, plus a non-SBRC size (10) to // exercise fused SBRC+transpose kernels. 
const static std::vector sbrc_range_3D = {50, 64, 81, 100, 200, 10, 128, 256}; const static std::vector sbrc_batch_range_3D = {2, 1}; // pick small sizes that will exercise 2D_SINGLE and a couple of sizes that won't const static std::vector inner_batch_3D_range = {4, 8, 16, 32, 20, 24, 64}; const static std::vector inner_batch_3D_range_half = {4, 8, 16, 32, 20, 24}; const static std::vector inner_batch_3D_batch_range = {3, 2, 1}; //----------------------------------------------------------------------- //----------------------------------------------------------------------- // partial pass test problems //----------------------------------------------------------------------- //----------------------------------------------------------------------- const static std::vector> partial_pass_adhoc_3D = {{64, 64, 64}}; const static std::vector partial_pass_batch_range_3D = {1, 5, 10, 20, 50}; #endif // ACCURACY_TESTS_RANGE_HrocFFT-rocm-6.4.3/clients/tests/bitwise_repro/000077500000000000000000000000001501537341300212675ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/tests/bitwise_repro/bitwise_repro_db.h000066400000000000000000000432541501537341300247720ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef BITWISE_REPRO_DB_H #define BITWISE_REPRO_DB_H #include "../../../shared/fft_hash.h" #include "sqlite3.h" #include #include #include #include #if __has_include() #include #else #include namespace std { namespace filesystem = experimental::filesystem; } #endif typedef size_t default_hash_type; template struct rocfft_test_run { rocfft_test_run(Tint ibuffer_hash_real_, Tint ibuffer_hash_imag_, Tint obuffer_hash_real_, Tint obuffer_hash_imag_, std::string token_, std::string runtime_version_, std::string lib_version_, std::string gpu_architecure_) : ibuffer_hash_real(ibuffer_hash_real_) , ibuffer_hash_imag(ibuffer_hash_imag_) , obuffer_hash_real(obuffer_hash_real_) , obuffer_hash_imag(obuffer_hash_imag_) , token(token_) , runtime_version(runtime_version_) , lib_version(lib_version_) , gpu_architecture(gpu_architecure_) { } static std::string get_create_rocfft_test_run_sql() { return "CREATE TABLE IF NOT EXISTS rocfft_test_run(ibuffer_hash_real TEXT NOT NULL, " "ibuffer_hash_imag TEXT NOT NULL, obuffer_hash_real TEXT NOT NULL, " "obuffer_hash_imag TEXT NOT NULL, token TEXT NOT NULL, runtime_version TEXT NOT " "NULL, lib_version TEXT NOT NULL, gpu_architecture TEXT NOT NULL); CREATE UNIQUE " "INDEX IF NOT EXISTS idx_unique_run ON rocfft_test_run(token, runtime_version, " "lib_version, gpu_architecture);"; } static std::string get_match_sql() { return "SELECT ibuffer_hash_real, ibuffer_hash_imag, obuffer_hash_real, obuffer_hash_imag, " "token, runtime_version, lib_version, gpu_architecture 
FROM rocfft_test_run WHERE " "token = ? AND runtime_version = ? AND lib_version = ? AND gpu_architecture = ? "; } static std::string get_insert_sql() { return "INSERT INTO rocfft_test_run(ibuffer_hash_real, ibuffer_hash_imag, " "obuffer_hash_real, obuffer_hash_imag, token, runtime_version, lib_version, " "gpu_architecture) VALUES (?,?,?,?,?,?,?,?)"; } void bind_insert_statement(sqlite3_stmt* stmt) { bind_ibuffer_hash_real(stmt, 1); bind_ibuffer_hash_imag(stmt, 2); bind_obuffer_hash_real(stmt, 3); bind_obuffer_hash_imag(stmt, 4); bind_token(stmt, 5); bind_runtime_version(stmt, 6); bind_lib_version(stmt, 7); bind_gpu_architecture(stmt, 8); } void bind_match_statement(sqlite3_stmt* stmt) { bind_token(stmt, 1); bind_runtime_version(stmt, 2); bind_lib_version(stmt, 3); bind_gpu_architecture(stmt, 4); } void update(sqlite3_stmt* stmt) { for(int col = 0; col < sqlite3_column_count(stmt); ++col) { auto col_name = sqlite3_column_name(stmt, col); auto col_value = std::string(reinterpret_cast(sqlite3_column_text(stmt, col))); if(strcmp(col_name, "ibuffer_hash_real") == 0) update_ibuffer_hash_real(col_value); if(strcmp(col_name, "ibuffer_hash_imag") == 0) update_ibuffer_hash_imag(col_value); if(strcmp(col_name, "obuffer_hash_real") == 0) update_obuffer_hash_real(col_value); if(strcmp(col_name, "obuffer_hash_imag") == 0) update_obuffer_hash_imag(col_value); if(strcmp(col_name, "token") == 0) update_token(col_value); if(strcmp(col_name, "runtime_version") == 0) update_runtime_version(col_value); if(strcmp(col_name, "lib_version") == 0) update_lib_version(col_value); if(strcmp(col_name, "gpu_architecture") == 0) update_gpu_architecture(col_value); } } Tint ibuffer_hash_real; Tint ibuffer_hash_imag; Tint obuffer_hash_real; Tint obuffer_hash_imag; private: std::string token; std::string runtime_version; std::string lib_version; std::string gpu_architecture; std::string get_ibuffer_hash_real() const { return std::to_string(ibuffer_hash_real); } std::string get_ibuffer_hash_imag() 
const { return std::to_string(ibuffer_hash_imag); } std::string get_obuffer_hash_real() const { return std::to_string(obuffer_hash_real); } std::string get_obuffer_hash_imag() const { return std::to_string(obuffer_hash_imag); } std::string get_token() const { return token; } std::string get_runtime_version() const { return runtime_version; } std::string get_lib_version() const { return lib_version; } std::string get_gpu_architecture() const { return gpu_architecture; } void bind_ibuffer_hash_real(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_ibuffer_hash_real().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding ibuffer_hash_real field in insert statement")); } void bind_ibuffer_hash_imag(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_ibuffer_hash_imag().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding ibuffer_hash_imag field in insert statement")); } void bind_obuffer_hash_real(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_obuffer_hash_real().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding obuffer_hash_real field in insert statement")); } void bind_obuffer_hash_imag(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_obuffer_hash_imag().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding obuffer_hash_imag field in insert statement")); } void bind_token(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_token().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error(std::string("Error binding token field in insert statement")); } void bind_runtime_version(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_runtime_version().c_str(), -1, 
SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding runtime_version field in insert statement")); } void bind_lib_version(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_lib_version().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding lib_version field in insert statement")); } void bind_gpu_architecture(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_gpu_architecture().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding gpu_architecture field in insert statement")); } void update_ibuffer_hash_real(const std::string& value) { std::stringstream stream(value); stream >> ibuffer_hash_real; } void update_ibuffer_hash_imag(const std::string& value) { std::stringstream stream(value); stream >> ibuffer_hash_imag; } void update_obuffer_hash_real(const std::string& value) { std::stringstream stream(value); stream >> obuffer_hash_real; } void update_obuffer_hash_imag(const std::string& value) { std::stringstream stream(value); stream >> obuffer_hash_imag; } void update_token(const std::string& value) { token = value; } void update_runtime_version(const std::string& value) { runtime_version = value; } void update_lib_version(const std::string& value) { lib_version = value; } void update_gpu_architecture(const std::string& value) { gpu_architecture = value; } }; template inline rocfft_test_run get_rocfft_test_run(const hash_output& ibuffer_hash, const hash_output& obuffer_hash, const std::string& token) { hipDeviceProp_t device_prop; if(hipGetDeviceProperties(&device_prop, 0) != hipSuccess) throw std::runtime_error("hipGetDeviceProperties failure"); auto gpu_arch = std::string(device_prop.gcnArchName); auto ver_sep = std::string("."); auto runtime_ver = std::to_string(HIP_VERSION_MAJOR) + ver_sep + std::to_string(HIP_VERSION_MINOR); const size_t ver_size = 256; char 
lib_version[ver_size]; rocfft_get_version_string(lib_version, ver_size); auto lib_ver_full = std::string(lib_version); auto idx_maj = lib_ver_full.find(ver_sep); auto idx_min = lib_ver_full.find(ver_sep, idx_maj + 1); auto idx_rev = lib_ver_full.find(ver_sep, idx_min + 1); auto ver_maj = lib_ver_full.substr(0, idx_maj); auto ver_min = lib_ver_full.substr(idx_maj + 1, idx_min - idx_maj - 1); auto ver_rev = lib_ver_full.substr(idx_min + 1, idx_rev - idx_min - 1); auto lib_ver = ver_maj + ver_sep + ver_min + ver_sep + ver_rev; return rocfft_test_run(ibuffer_hash.buffer_real, ibuffer_hash.buffer_imag, obuffer_hash.buffer_real, obuffer_hash.buffer_imag, token, runtime_ver, lib_ver, gpu_arch); } class fft_hash_db { public: fft_hash_db(std::string db_path) : ret(SQLITE_OK) , db_connection(nullptr) , begin_stmt(nullptr) , end_stmt(nullptr) , insert_stmt(nullptr) , match_stmt(nullptr) { ret = sqlite3_open(db_path.c_str(), &db_connection); if(ret != SQLITE_OK) throw std::runtime_error(std::string("Cannot open repro-db: ") + db_path); // Access to a database file may occur in parallel. // Increase default sqlite timeout, so diferent process // can wait for one another. sqlite3_busy_timeout(db_connection, 30000); // Set sqlite3 engine to WAL mode to avoid potential deadlocks with multiple // concurrent processes (if a deadlock occurs, the busy timeout is not honored). 
ret = sqlite3_exec(db_connection, "PRAGMA journal_mode = WAL", nullptr, nullptr, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Error setting WAL mode: " + std::string(sqlite3_errmsg(db_connection))); ret = sqlite3_exec(db_connection, rocfft_test_run<>::get_create_rocfft_test_run_sql().c_str(), nullptr, nullptr, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Error creating table: " + std::string(sqlite3_errmsg(db_connection))); prepare_begin_end_stmts(); prepare_match_stmt(); prepare_insert_stmt(); } ~fft_hash_db() { sqlite3_finalize(begin_stmt); sqlite3_finalize(end_stmt); sqlite3_finalize(match_stmt); sqlite3_finalize(insert_stmt); sqlite3_close(db_connection); } template void check_hash_valid(const hash_output& ibuffer_hash, const hash_output& obuffer_hash, const std::string& token, bool& hash_entry_found, bool& hash_valid) { hash_valid = true; auto test_run = get_rocfft_test_run(ibuffer_hash, obuffer_hash, token); begin_transaction(); hash_entry_found = check_match(&test_run); if(hash_entry_found) hash_valid = (test_run.ibuffer_hash_real == ibuffer_hash.buffer_real && test_run.ibuffer_hash_imag == ibuffer_hash.buffer_imag && test_run.obuffer_hash_real == obuffer_hash.buffer_real && test_run.obuffer_hash_imag == obuffer_hash.buffer_imag) ? 
true : false; else insert(&test_run); end_transaction(); } private: void prepare_begin_end_stmts() { auto begin_sql = std::string("BEGIN TRANSACTION;"); ret = sqlite3_prepare_v2(db_connection, begin_sql.c_str(), -1, &begin_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare begin statement: " + std::string(sqlite3_errmsg(db_connection))); auto end_sql = std::string("END TRANSACTION;"); ret = sqlite3_prepare_v2(db_connection, end_sql.c_str(), -1, &end_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare end statement: " + std::string(sqlite3_errmsg(db_connection))); } void prepare_match_stmt() { auto match_sql = rocfft_test_run<>::get_match_sql(); ret = sqlite3_prepare_v2(db_connection, match_sql.c_str(), -1, &match_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare match statement: " + std::string(sqlite3_errmsg(db_connection))); } void prepare_insert_stmt() { auto insert_sql = rocfft_test_run<>::get_insert_sql(); ret = sqlite3_prepare_v2(db_connection, insert_sql.c_str(), -1, &insert_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare insert statement: " + std::string(sqlite3_errmsg(db_connection))); } void begin_transaction() { ret = sqlite3_step(begin_stmt); if(ret != SQLITE_DONE) throw std::runtime_error(std::string("Error executing begin statement: ") + std::string(sqlite3_errmsg(db_connection))); } void end_transaction() { ret = sqlite3_step(end_stmt); if(ret != SQLITE_DONE) throw std::runtime_error(std::string("Error executing end statement: ") + std::string(sqlite3_errmsg(db_connection))); } template bool check_match(rocfft_test_run* entry) { sqlite3_reset(match_stmt); entry->bind_match_statement(match_stmt); size_t match_count = 0; while((ret = sqlite3_step(match_stmt)) == SQLITE_ROW) { entry->update(match_stmt); match_count++; } // There can only be one result in this query if(match_count > 1) throw std::runtime_error("Corrupted database"); if(ret != 
SQLITE_DONE) throw std::runtime_error(std::string("Error executing select statement: ") + std::string(sqlite3_errmsg(db_connection))); return match_count; } template void insert(rocfft_test_run* entry) { sqlite3_reset(insert_stmt); entry->bind_insert_statement(insert_stmt); ret = sqlite3_step(insert_stmt); if(ret != SQLITE_DONE) throw std::runtime_error(std::string("Error executing insert statement: ") + std::string(sqlite3_errmsg(db_connection))); } int ret; sqlite3* db_connection; sqlite3_stmt* begin_stmt; sqlite3_stmt* end_stmt; sqlite3_stmt* insert_stmt; sqlite3_stmt* match_stmt; }; #endif // BITWISE_REPRO_DB_HrocFFT-rocm-6.4.3/clients/tests/bitwise_repro/bitwise_repro_test.cpp000066400000000000000000000767551501537341300257330ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include "../../../shared/params_gen.h" #include "../../../shared/rocfft_params.h" #include "../accuracy_tests_range.h" #include "bitwise_repro_test.h" using ::testing::ValuesIn; TEST(bitwise_repro_test, compare_precisions) { rocfft_params params_1; // clang-format off params_1.from_token(std::string("complex_forward_len_192_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_192_odist_192_ioffset_0_0_ooffset_0_0")); // clang-format on params_1.validate(); rocfft_params params_2; // clang-format off params_2.from_token(std::string("complex_forward_len_192_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_192_odist_192_ioffset_0_0_ooffset_0_0")); // clang-format on params_2.validate(); if(!params_1.valid(verbose) || !params_2.valid(verbose)) { if(verbose) std::cout << "Invalid parameters, skip this test." << std::endl; GTEST_SKIP(); } try { bitwise_repro(params_1, params_2); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } SUCCEED(); } TEST(bitwise_repro_test, compare_lengths) { rocfft_params params_1; // clang-format off params_1.from_token(std::string("complex_forward_len_64_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_64_odist_64_ioffset_0_0_ooffset_0_0")); // clang-format on params_1.validate(); rocfft_params params_2; // clang-format off params_2.from_token(std::string("complex_forward_len_32_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_32_odist_32_ioffset_0_0_ooffset_0_0")); // clang-format on params_2.validate(); if(!params_1.valid(verbose) || !params_2.valid(verbose)) { if(verbose) std::cout << "Invalid parameters, skip this test." 
<< std::endl; GTEST_SKIP(); } try { bitwise_repro(params_1, params_2); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } SUCCEED(); } TEST(bitwise_repro_test, compare_transform_types) { rocfft_params params_1; // clang-format off params_1.from_token(std::string("complex_forward_len_256_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_256_odist_256_ioffset_0_0_ooffset_0_0")); // clang-format on params_1.validate(); rocfft_params params_2; // clang-format off params_2.from_token(std::string("complex_inverse_len_256_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_256_odist_256_ioffset_0_0_ooffset_0_0")); // clang-format on params_2.validate(); if(!params_1.valid(verbose) || !params_2.valid(verbose)) { if(verbose) std::cout << "Invalid parameters, skip this test." << std::endl; GTEST_SKIP(); } try { bitwise_repro(params_1, params_2); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } SUCCEED(); } TEST_P(bitwise_repro_test, compare_to_reference) { if(repro_db == nullptr) GTEST_SKIP() << "A database file is required for this test." << std::endl; rocfft_params params(GetParam()); params.validate(); // Test that the tokenization works as expected. auto token = params.token(); fft_params tokentest; tokentest.from_token(token); auto token1 = tokentest.token(); EXPECT_EQ(token, token1); if(!params.valid(verbose)) { if(verbose) { std::cout << "Invalid parameters, skip this test." 
<< std::endl; } GTEST_SKIP(); } try { bitwise_repro(params); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } SUCCEED(); } //----------------------------------------------------------------------- //----------------------------------------------------------------------- // 1D test problems //----------------------------------------------------------------------- //----------------------------------------------------------------------- INSTANTIATE_TEST_SUITE_P(pow2_1D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_1D_half, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_1D}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_1D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_1D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(radX_1D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range_1D}), precision_range_full, batch_range_1D, stride_range, 
stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_1D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_1D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); // small 1D sizes just need to make sure our factorization isn't // completely broken, so we just check simple C2C outplace interleaved INSTANTIATE_TEST_SUITE_P(small_1D, bitwise_repro_test, ::testing::ValuesIn(param_generator_base( test_prob, {fft_transform_type_complex_forward}, generate_lengths({small_1D_sizes()}), {fft_precision_single}, {1}, [](fft_transform_type t, const std::vector& place_range, const bool planar) { return std::vector{ std::make_tuple(t, place_range[0], fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)}; }, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace}, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex, bitwise_repro_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride_1D}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex_half, bitwise_repro_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride_half_1D}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2_1D, 
stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real, bitwise_repro_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_1D}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real_half, bitwise_repro_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_half_1D}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); //----------------------------------------------------------------------- //----------------------------------------------------------------------- // 2D test problems //----------------------------------------------------------------------- //----------------------------------------------------------------------- INSTANTIATE_TEST_SUITE_P(pow2_2D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_2D, pow2_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_2D_half, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_2D, {2, 4, 8, 16, 32}}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_2D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_2D, pow3_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, 
ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_2D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_2D, pow5_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_2D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_2D, prime_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_2D, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_2D, mix_range_2D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); // test length-1 on one dimension against a variety of non-1 lengths INSTANTIATE_TEST_SUITE_P(len1_2D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); // length-1 on the other dimension INSTANTIATE_TEST_SUITE_P(len1_swap_2D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); //----------------------------------------------------------------------- //----------------------------------------------------------------------- // 3D test problems 
//----------------------------------------------------------------------- //----------------------------------------------------------------------- INSTANTIATE_TEST_SUITE_P(pow2_3D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow2_range_3D, pow2_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_3D_half, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_3D, pow2_range_half_3D, pow2_range_half_3D}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_3D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow3_range_3D, pow3_range_3D, pow3_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_3D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow5_range_3D, pow5_range_3D, pow5_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_3D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({prime_range_3D, prime_range_3D, prime_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_3D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow3_range_3D, prime_range_3D}), precision_range_sp_dp, batch_range, 
stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P(sbrc_3D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({sbrc_range_3D, sbrc_range_3D, sbrc_range_3D}), precision_range_sp_dp, sbrc_batch_range_3D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P( inner_batch_3D, bitwise_repro_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}), precision_range_sp_dp, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); INSTANTIATE_TEST_SUITE_P( inner_batch_3D_half, bitwise_repro_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({inner_batch_3D_range_half, inner_batch_3D_range_half, inner_batch_3D_range_half}), {fft_precision_half}, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), bitwise_repro_test::TestName); rocFFT-rocm-6.4.3/clients/tests/bitwise_repro/bitwise_repro_test.h000066400000000000000000000337521501537341300253660ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef BITWISE_REPRO_TEST_H #define BITWISE_REPRO_TEST_H #include #include #include #include #include #include "../../../shared/accuracy_test.h" #include "../../../shared/enum_to_string.h" #include "../../../shared/fft_params.h" #include "../../../shared/gpubuf.h" #include "../../../shared/rocfft_params.h" #include "../../../shared/test_params.h" #include "bitwise_repro_db.h" extern int verbose; extern std::unique_ptr repro_db; // Base gtest class for bitwise reproduction tests class bitwise_repro_test : public ::testing::TestWithParam { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; // execute the GPU transform template inline void execute_fft(Tparams& params, std::vector& pibuffer, std::vector& pobuffer, std::vector& obuffer, std::vector& gpu_output) { // Execute the transform: auto fft_status = params.execute(pibuffer.data(), pobuffer.data()); if(fft_status != fft_status_success) throw std::runtime_error("rocFFT plan execution failure"); ASSERT_TRUE(!gpu_output.empty()) << "no output buffers"; for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) { ASSERT_TRUE(gpu_output[idx].data() != nullptr) << "output buffer index " << idx << " is empty"; auto hip_status = hipMemcpy(gpu_output[idx].data(), pobuffer.at(idx), gpu_output[idx].size(), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; std::stringstream msg; msg << "hipMemcpy failure"; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } } if(verbose > 2) { std::cout << "GPU output:\n"; params.print_obuffer(gpu_output); } if(verbose > 5) { std::cout << "flat GPU output:\n"; params.print_obuffer_flat(gpu_output); } } template void compute_fft_data(Tparams& params, std::vector& fft_input, std::vector& fft_output) { // Call hipGetLastError to reset any errors // returned by previous HIP runtime API calls. 
hipError_t hip_status = hipGetLastError(); // Make sure that the parameters make sense: ASSERT_TRUE(params.valid(verbose)); // Make sure FFT buffers fit in device memory check_problem_fits_device_memory(params, verbose); auto ibuffer_sizes = params.ibuffer_sizes(); auto obuffer_sizes = params.obuffer_sizes(); // Create FFT plan - this will also allocate work buffer, but // will throw a specific exception if that step fails auto plan_status = fft_status_success; try { plan_status = params.create_plan(); } catch(fft_params::work_buffer_alloc_failure& e) { ++n_hip_failures; std::stringstream msg; msg << "Work buffer allocation failed with size: " << params.workbuffersize; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); if(hip_status != hipSuccess) { std::stringstream msg; msg << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); ++n_hip_failures; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } pibuffer[i] = ibuffer[i].data(); } // allocation counts in elements, ibuffer_sizes is in bytes auto ibuffer_sizes_elems = ibuffer_sizes; for(auto& buf : ibuffer_sizes_elems) buf /= var_size(params.precision, params.itype); fft_input = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); // generate input based on either cpu/gpu #ifdef USE_HIPRAND // generate the input directly on the gpu params.compute_input(ibuffer); // Copy input to CPU for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { hip_status = hipMemcpy(fft_input.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], 
hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { std::stringstream msg; msg << "hipMemcpy failure with error " << hip_status; ++n_hip_failures; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } } #else // generate the input on the cpu params.compute_input(fft_input); // Copy input to GPU for(unsigned int idx = 0; idx < fft_input.size(); ++idx) { hip_status = hipMemcpy(ibuffer[idx].data(), fft_input.at(idx).data(), ibuffer_sizes[idx], hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; std::stringstream ss; ss << "hipMemcpy failure with error " << hip_status; if(skip_runtime_fails) { throw ROCFFT_SKIP{ss.str()}; } else { throw ROCFFT_FAIL{ss.str()}; } } } #endif std::vector obuffer_data; std::vector* obuffer = &obuffer_data; std::vector pobuffer; // allocate the output buffer if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); if(hip_status != hipSuccess) { ++n_hip_failures; std::stringstream msg; msg << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } } } pobuffer.resize(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } // execute GPU transform fft_output = allocate_host_buffer(params.precision, params.otype, params.osize); execute_fft(params, pibuffer, pobuffer, *obuffer, fft_output); } template inline void bitwise_repro_impl(Tparams& params, Tparams& params_comp) { std::vector fft_input, fft_output; compute_fft_data(params, fft_input, fft_output); auto ibuffer_hash_in = 
hash_input(rocfft_precision_from_fftparams(params.precision), params.ilength(), params.istride, params.idist, rocfft_array_type_from_fftparams(params.itype), params.nbatch); auto ibuffer_hash_out = hash_output(); compute_hash(fft_input, ibuffer_hash_in, ibuffer_hash_out); auto obuffer_hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.olength(), params.ostride, params.odist, rocfft_array_type_from_fftparams(params.otype), params.nbatch); auto obuffer_hash_out = hash_output(); compute_hash(fft_output, obuffer_hash_in, obuffer_hash_out); if(params_comp.token().compare(params.token()) == 0) { std::stringstream msg; msg << "FFT input tokens are identical"; throw ROCFFT_SKIP{msg.str()}; } std::vector fft_input_comp, fft_output_comp; compute_fft_data(params_comp, fft_input_comp, fft_output_comp); auto obuffer_hash_in_comp = hash_input(rocfft_precision_from_fftparams(params_comp.precision), params_comp.olength(), params_comp.ostride, params_comp.odist, rocfft_array_type_from_fftparams(params_comp.otype), params_comp.nbatch); auto obuffer_hash_out_comp = hash_output(); compute_hash(fft_output_comp, obuffer_hash_in_comp, obuffer_hash_out_comp); params.free(); params_comp.free(); // FFT params are not identical and, therefore, // must also have different fft outputs. 
ASSERT_FALSE(obuffer_hash_out_comp == obuffer_hash_out) << "Different FFT params have the same output hash."; } template inline void bitwise_repro_impl(Tparams& params) { std::vector fft_input, fft_output; compute_fft_data(params, fft_input, fft_output); auto ibuffer_hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.ilength(), params.istride, params.idist, rocfft_array_type_from_fftparams(params.itype), params.nbatch); auto ibuffer_hash_out = hash_output(); compute_hash(fft_input, ibuffer_hash_in, ibuffer_hash_out); auto obuffer_hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.olength(), params.ostride, params.odist, rocfft_array_type_from_fftparams(params.otype), params.nbatch); auto obuffer_hash_out = hash_output(); compute_hash(fft_output, obuffer_hash_in, obuffer_hash_out); bool hash_entry_found, hash_valid; if(verbose) { std::cout << "input buffer hash: (" << ibuffer_hash_out.buffer_real << "," << ibuffer_hash_out.buffer_imag << ")" << std::endl; std::cout << "output buffer hash: (" << obuffer_hash_out.buffer_real << "," << obuffer_hash_out.buffer_imag << ")" << std::endl; } repro_db->check_hash_valid( ibuffer_hash_out, obuffer_hash_out, params.token(), hash_entry_found, hash_valid); params.free(); if(hash_entry_found) ASSERT_TRUE(hash_valid) << "FFT result is not bitwise reproducible."; else { std::stringstream msg; msg << "FFT result entry added to the repro-db file. Previously stored reference entry not " "found. 
\n"; throw ROCFFT_SKIP{msg.str()}; } } inline void bitwise_repro(rocfft_params& params) { switch(params.precision) { case fft_precision_half: bitwise_repro_impl(params); break; case fft_precision_single: bitwise_repro_impl(params); break; case fft_precision_double: bitwise_repro_impl(params); break; } } inline void bitwise_repro(rocfft_params& params, rocfft_params& params_comp) { switch(params.precision) { case fft_precision_half: bitwise_repro_impl(params, params_comp); break; case fft_precision_single: bitwise_repro_impl(params, params_comp); break; case fft_precision_double: bitwise_repro_impl(params, params_comp); break; } } #endif // BITWISE_REPRO_TEST_H rocFFT-rocm-6.4.3/clients/tests/buffer_hash_test.cpp000066400000000000000000000273541501537341300224440ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/fft_hash.h"
#include "../../shared/rocfft_params.h"
#include
#include
#include
#include
#include
#include

// Fill `param` with the fixed problem description shared by the hashing
// tests in this file: a single batch of one 1D, unit-stride, zero-offset,
// in-place, complex-interleaved buffer of 16777216 elements, at the
// precision requested by the caller.
static void set_params(const fft_precision precision, fft_params& param)
{
    const std::vector lengths     = {16777216};
    const std::vector strides     = {1};
    const std::vector offsets     = {0};
    const size_t      batch_count = 1;
    const auto        layout      = fft_array_type::fft_array_type_complex_interleaved;

    // Scalar properties of the transform.
    param.precision = precision;
    param.placement = fft_placement_inplace;
    param.nbatch    = batch_count;

    // Geometry: input and output share the same length, stride and distance.
    param.length  = lengths;
    param.istride = strides;
    param.ostride = strides;
    param.idist   = lengths[0];
    param.odist   = lengths[0];

    // Buffer sizes, element layout and offsets.
    param.isize   = {lengths[0]};
    param.osize   = {lengths[0]};
    param.itype   = layout;
    param.otype   = layout;
    param.ioffset = offsets;
    param.ooffset = offsets;
}

// Create an fft params struct for a contiguous input/output buffer.
// Purpose of the unit tests here is only to test the hashing strategy,
// i.e., to reduce multiple floating point values to a single 64 bit
// identifier. The strategy for hashing a non-contiguous buffer is
// essentially the same, only the data access pattern is changed.
static void validate_buffer_params(const fft_params& param) { ASSERT_EQ(param.length.size() == 1, true); ASSERT_EQ(param.istride.size() == 1, true); ASSERT_EQ(param.istride[0] == 1, true); ASSERT_EQ(param.ostride.size() == 1, true); ASSERT_EQ(param.ostride[0] == 1, true); ASSERT_EQ(param.ioffset.size() == 1, true); ASSERT_EQ(param.ioffset[0] == 0, true); ASSERT_EQ(param.ooffset.size() == 1, true); ASSERT_EQ(param.ooffset[0] == 0, true); ASSERT_EQ(param.isize.size() == 1, true); ASSERT_EQ(param.isize[0] == param.length[0], true); ASSERT_EQ(param.osize.size() == 1, true); ASSERT_EQ(param.osize[0] == param.length[0], true); ASSERT_EQ(param.nbatch == 1, true); ASSERT_EQ(param.itype == fft_array_type_complex_interleaved, true); ASSERT_EQ(param.otype == fft_array_type_complex_interleaved, true); ASSERT_EQ(param.placement == fft_placement_inplace, true); } static unsigned int gen_seed() { auto seed = static_cast(time(NULL)); return seed; } template static void shuffle_buffer(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::random_device rd; std::mt19937 g(rd()); std::shuffle(idata, idata + N, g); } static void shuffle_buffer(const fft_params& param, const size_t seed, std::vector& buffer) { validate_buffer_params(param); auto N = param.length[0]; switch(param.precision) { case fft_precision_half: shuffle_buffer(N, seed, buffer); break; case fft_precision_double: shuffle_buffer(N, seed, buffer); break; case fft_precision_single: shuffle_buffer(N, seed, buffer); break; default: abort(); } } template static void corrupt_buffer_single(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::minstd_rand gen(seed); std::uniform_real_distribution dist1(0.0f, 1.0f); std::uniform_real_distribution dist2(-1.0f, 1.0f); auto random_id = static_cast(dist1(gen) * static_cast(N - 1)); auto real = idata[random_id].real(); auto imag = idata[random_id].imag(); 
idata[random_id].real(real + dist2(gen)); idata[random_id].imag(imag + dist2(gen)); } static void corrupt_buffer_single(const fft_params& param, const size_t seed, std::vector& buffer) { validate_buffer_params(param); auto N = param.length[0]; switch(param.precision) { case fft_precision_half: corrupt_buffer_single(N, seed, buffer); break; case fft_precision_double: corrupt_buffer_single(N, seed, buffer); break; case fft_precision_single: corrupt_buffer_single(N, seed, buffer); break; default: abort(); } } template static void corrupt_buffer_full(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); for(size_t i = 0; i < N; i++) { auto real = idata[i].real(); auto imag = idata[i].imag(); idata[i].real(real + dist(gen)); idata[i].imag(imag + dist(gen)); } } static void corrupt_buffer_full(const fft_params& param, const size_t seed, std::vector& buffer) { validate_buffer_params(param); auto N = param.length[0]; switch(param.precision) { case fft_precision_half: corrupt_buffer_full(N, seed, buffer); break; case fft_precision_double: corrupt_buffer_full(N, seed, buffer); break; case fft_precision_single: corrupt_buffer_full(N, seed, buffer); break; default: abort(); } } template static void init_buffer(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); for(size_t i = 0; i < N; i++) { idata[i].real(dist(gen)); idata[i].imag(dist(gen)); } } static void init_buffer(const fft_params& params, const size_t seed, std::vector& buffer) { validate_buffer_params(params); auto N = params.length[0]; switch(params.precision) { case fft_precision_half: init_buffer(N, seed, buffer); break; case fft_precision_double: init_buffer(N, seed, buffer); break; case fft_precision_single: init_buffer(N, seed, buffer); break; default: 
abort(); } } static void run_test(const rocfft_params& params) { auto hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.ilength(), params.istride, params.idist, rocfft_array_type_from_fftparams(params.itype), params.nbatch); auto hash_out_1 = hash_output(); auto hash_out_2 = hash_output(); auto seed = gen_seed(); std::vector buffer1, buffer2; buffer1 = allocate_host_buffer(params.precision, params.itype, params.ibuffer_sizes()); buffer2 = allocate_host_buffer(params.precision, params.itype, params.ibuffer_sizes()); init_buffer(params, seed, buffer1); compute_hash(buffer1, hash_in, hash_out_1); copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real == hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; ASSERT_EQ(hash_out_1.buffer_imag == hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); corrupt_buffer_full(params, seed, buffer2); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real != hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; ASSERT_EQ(hash_out_1.buffer_imag != hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); corrupt_buffer_single(params, seed, buffer2); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real != hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; 
ASSERT_EQ(hash_out_1.buffer_imag != hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); shuffle_buffer(params, seed, buffer2); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real != hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; ASSERT_EQ(hash_out_1.buffer_imag != hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; } TEST(rocfft_UnitTest, buffer_hashing_half) { rocfft_params params; set_params(fft_precision_half, params); try { run_test(params); } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } } TEST(rocfft_UnitTest, buffer_hashing_single) { rocfft_params params; set_params(fft_precision_single, params); try { run_test(params); } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } } TEST(rocfft_UnitTest, buffer_hashing_double) { rocfft_params params; set_params(fft_precision_double, params); try { run_test(params); } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } } rocFFT-rocm-6.4.3/clients/tests/callback_change_type.cpp000066400000000000000000000227671501537341300232360ustar00rootroot00000000000000// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/hostbuf.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_complex.h" #include "../../shared/rocfft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/rocfft_against_fftw.h" #include GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(change_type); // callback functions to cast data from short to float __host__ __device__ float load_callback_short(short* input, size_t offset, void* cbdata, void* sharedMem) { return static_cast(input[offset]); } __host__ __device__ float2 load_callback_short2(short2* input, size_t offset, void* cbdata, void* sharedMem) { return float2{static_cast(input[offset].x), static_cast(input[offset].y)}; } __device__ auto load_callback_short_dev = load_callback_short; __device__ auto load_callback_short2_dev = load_callback_short2; class change_type : public ::testing::TestWithParam { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; // aim for 1D lengths that might need ordinary Stockham, transpose, // Bluestein kernels to treat real data as complex std::vector> callback_type_sizes = {{4}, {60}, {122}, {220}, {8192}, {4500000}}; // test complex + real forward transforms. real inverse is not a valid // test case here, because we're allowed to overwrite input on those. 
// the input can't be any smaller than what rocFFT thinks it is, // because the overwrite will fail. const static std::vector> stride_range = {{1}}; INSTANTIATE_TEST_SUITE_P(callback, change_type, ::testing::ValuesIn(param_generator_base( test_prob, {fft_transform_type_complex_forward, fft_transform_type_real_forward}, callback_type_sizes, {fft_precision_single}, {1}, generate_types, stride_range, stride_range, {{0, 0}}, {{0, 0}}, {fft_placement_notinplace}, false, false)), accuracy_test::TestName); // run an out-of-place transform that casts input from short to float TEST_P(change_type, short_to_float) { rocfft_params params(GetParam()); params.run_callbacks = true; ASSERT_EQ(params.create_plan(), fft_status_success); // input has 2 shorts/floats for complex data, 1 otherwise. // output is always complex for these tests. const size_t input_complex = params.transform_type != fft_transform_type_real_forward ? 2 : 1; // allocate gpubuf gpu_input; gpubuf gpu_output; std::vector cpu_input(1); std::vector cpu_output(1); try { // gpu input is actually shorts, everything else is float ASSERT_EQ(gpu_input.alloc(params.isize[0] * sizeof(short) * input_complex), hipSuccess); ASSERT_EQ(gpu_output.alloc(params.osize[0] * sizeof(float) * 2), hipSuccess); cpu_input[0].alloc(params.isize[0] * sizeof(float) * input_complex); cpu_output[0].alloc(params.osize[0] * sizeof(float) * 2); // generate short (16-bit) and float (32-bit) input std::mt19937 gen; std::uniform_int_distribution dis(-3, 3); std::vector cpu_input_short(params.isize[0] * input_complex); for(auto& i : cpu_input_short) i = dis(gen); // copy short input to gpubuf ASSERT_EQ(hipMemcpy(gpu_input.data(), cpu_input_short.data(), sizeof(short) * cpu_input_short.size(), hipMemcpyHostToDevice), hipSuccess); // convert shorts to floats for FFTW input std::copy(cpu_input_short.begin(), cpu_input_short.end(), static_cast(cpu_input[0].data())); // get callback function so we can pass it to rocfft void* callback_host; 
if(input_complex == 1) { ASSERT_EQ(hipMemcpyFromSymbol( &callback_host, HIP_SYMBOL(load_callback_short_dev), sizeof(void*)), hipSuccess); } else { ASSERT_EQ(hipMemcpyFromSymbol( &callback_host, HIP_SYMBOL(load_callback_short2_dev), sizeof(void*)), hipSuccess); } ASSERT_EQ(params.set_callbacks(callback_host, nullptr, nullptr, nullptr), fft_status_success); // run rocFFT void* gpu_input_ptr = gpu_input.data(); void* gpu_output_ptr = gpu_output.data(); ASSERT_EQ(params.execute(&gpu_input_ptr, &gpu_output_ptr), fft_status_success); // construct + run FFTW plan auto cpu_plan = fftw_plan_via_rocfft(params.length, params.istride, params.ostride, params.nbatch, params.idist, params.odist, params.transform_type, cpu_input, cpu_output); fftw_run(params.transform_type, cpu_plan, cpu_input, cpu_output); // copy rocFFT output back to CPU std::vector gpu_output_copy(1); gpu_output_copy[0].alloc(gpu_output.size()); ASSERT_EQ(hipMemcpy(gpu_output_copy[0].data(), gpu_output.data(), gpu_output.size(), hipMemcpyDeviceToHost), hipSuccess); auto cpu_output_norm = norm(cpu_output, params.olength(), params.nbatch, params.precision, params.otype, params.ostride, params.odist, params.ooffset); ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2)); ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf)); auto gpu_output_norm = norm(gpu_output_copy, params.olength(), params.nbatch, params.precision, params.otype, params.ostride, params.odist, params.ooffset); ASSERT_TRUE(std::isfinite(gpu_output_norm.l_2)); ASSERT_TRUE(std::isfinite(gpu_output_norm.l_inf)); double linf_cutoff = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(params.length.front()); auto diff = distance(cpu_output, gpu_output_copy, params.olength(), params.nbatch, params.precision, params.otype, params.ostride, params.odist, params.otype, params.ostride, params.odist, nullptr, linf_cutoff, params.ioffset, params.ooffset); ASSERT_TRUE(diff.l_inf <= linf_cutoff); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory 
allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } } rocFFT-rocm-6.4.3/clients/tests/cmake/000077500000000000000000000000001501537341300174725ustar00rootroot00000000000000rocFFT-rocm-6.4.3/clients/tests/cmake/FindFFTW.cmake000066400000000000000000000114341501537341300220460ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# #if( FFTW_FIND_VERSION VERSION_LESS "3" ) # message( FFTW_FIND_VERION is ${FFTW_FIND_VERSION}) # message( FATAL_ERROR "FindFFTW can not configure versions less than FFTW 3.0.0" ) #endif( ) find_path(FFTW_INCLUDE_DIRS NAMES fftw3.h HINTS ${FFTW_ROOT}/include $ENV{FFTW_ROOT}/include PATHS /usr/include /usr/local/include ) mark_as_advanced( FFTW_INCLUDE_DIRS ) # message( STATUS "FFTW_FIND_COMPONENTS: ${FFTW_FIND_COMPONENTS}" ) # message( STATUS "FFTW_FIND_REQUIRED_FLOAT: ${FFTW_FIND_REQUIRED_FLOAT}" ) # message( STATUS "FFTW_FIND_REQUIRED_DOUBLE: ${FFTW_FIND_REQUIRED_DOUBLE}" ) include( CheckSymbolExists ) set( FFTW_LIBRARIES "" ) if( FFTW_FIND_REQUIRED_FLOAT OR FFTW_FIND_REQUIRED_SINGLE ) find_library( FFTW_LIBRARIES_SINGLE NAMES fftw3f fftw3f-3 fftw3 fftw3-3 HINTS ${FFTW_ROOT}/lib $ENV{FFTW_ROOT}/lib PATHS /usr/lib /usr/local/lib PATH_SUFFIXES x86_64-linux-gnu DOC "FFTW dynamic library single" ) mark_as_advanced( FFTW_LIBRARIES_SINGLE ) list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_SINGLE} ) # Look for omp (preferred) or thread libraries. These are not a # hard requirement, but are nice to have to make FFTW run faster. 
find_library( FFTWF_OMP_LIBRARY fftw3f_omp ) find_library( FFTWF_THREADS_LIBRARY fftw3f_threads ) if( FFTWF_OMP_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTWF_OMP_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) elseif( FFTWF_THREADS_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTWF_THREADS_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) endif() list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_SINGLE} ) check_symbol_exists( fftwf_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN ) endif( ) if( FFTW_FIND_REQUIRED_DOUBLE ) find_library( FFTW_LIBRARIES_DOUBLE NAMES fftw3 HINTS ${FFTW_ROOT}/lib $ENV{FFTW_ROOT}/lib PATHS /usr/lib /usr/local/lib PATH_SUFFIXES x86_64-linux-gnu DOC "FFTW dynamic library double" ) mark_as_advanced( FFTW_LIBRARIES_DOUBLE ) list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} ) # Look for omp (preferred) or thread libraries. These are not a # hard requirement, but are nice to have to make FFTW run faster. find_library( FFTW_OMP_LIBRARY fftw3_omp ) find_library( FFTW_THREADS_LIBRARY fftw3_threads ) if( FFTW_OMP_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTW_OMP_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) elseif( FFTW_THREADS_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTW_THREADS_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) endif() list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} ) check_symbol_exists( fftw_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN ) endif( ) if( BUILD_FFTW OR FFTW_HAVE_SPRINT_PLAN ) target_compile_definitions( rocfft-test PUBLIC FFTW_HAVE_SPRINT_PLAN ) endif() include( FindPackageHandleStandardArgs ) FIND_PACKAGE_HANDLE_STANDARD_ARGS( FFTW REQUIRED_VARS FFTW_INCLUDE_DIRS FFTW_LIBRARIES ) # assume the threads feature is always enabled on Windows, since it's # not a separate library there if( FFTW_FOUND AND WIN32 ) set( FFTW_MULTITHREAD TRUE ) endif() if( NOT FFTW_FOUND ) message( STATUS "FindFFTW could not find all of the following fftw libraries" ) message( STATUS "${FFTW_FIND_COMPONENTS}" ) else( ) message(STATUS "FindFFTW configured 
variables:" ) message(STATUS "FFTW_INCLUDE_DIRS: ${FFTW_INCLUDE_DIRS}" ) message(STATUS "FFTW_LIBRARIES: ${FFTW_LIBRARIES}" ) endif() rocFFT-rocm-6.4.3/clients/tests/default_callbacks_test.cpp000066400000000000000000000404311501537341300236020ustar00rootroot00000000000000// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include "../../shared/fftw_transform.h" #include "../../shared/hip_object_wrapper.h" #include "../../shared/rocfft_params.h" #include "rocfft/rocfft.h" // ------------------------------------- // default load callback definitions // ------------------------------------- template __device__ T load_cb(T* data, size_t offset, void* cbdata, void* sharedMem) { return data[offset]; } __device__ auto load_cb_complex_double = load_cb>; __device__ auto load_cb_double = load_cb; __device__ auto load_cb_complex_float = load_cb>; __device__ auto load_cb_float = load_cb; // ------------------------------------- // default store callback definitions // ------------------------------------- template __device__ void store_cb(T* data, size_t offset, T element, void* cbdata, void* sharedMem) { data[offset] = element; } __device__ auto store_cb_complex_double = store_cb>; __device__ auto store_cb_double = store_cb; __device__ auto store_cb_complex_float = store_cb>; __device__ auto store_cb_float = store_cb; // ------------------------------------- // type traits definitions // ------------------------------------- template struct is_hip_complex { static const bool value = false; }; template <> struct is_hip_complex> { static const bool value = true; }; template <> struct is_hip_complex> { static const bool value = true; }; // ASAN introduces some problems with mixing library and client // callbacks, so skip these tests if it's enabled #ifdef ADDRESS_SANITIZER #define TEST_CALLBACK_CHECK_ASAN \ GTEST_SKIP() << "mixed library/client callbacks not supported for ASAN"; #else #define TEST_CALLBACK_CHECK_ASAN #endif // ------------------------------------- // test callbacks struct // ------------------------------------- enum struct DefaultCallbackType { LOAD, STORE, }; struct Test_Callback { Test_Callback(size_t _N, size_t _dim, rocfft_transform_type_e _frwd_transf_type, rocfft_precision_e _frwd_transf_precision, DefaultCallbackType _cb_type, 
uint32_t _seed) : N(_N) , dim(_dim) , fwrd_transf_type(_frwd_transf_type) , frwd_transf_precision(_frwd_transf_precision) , cb_type(_cb_type) , seed(_seed) { float low_bound_f = -1.0f, up_bound_f = 1.0f; double low_bound_d = -1.0, up_bound_d = 1.0; std::vector> h_mem_out_f2, h_mem_out_no_cb_f2; std::vector> h_mem_out_d2, h_mem_out_no_cb_d2; switch(fwrd_transf_type) { case rocfft_transform_type_complex_forward: { std::vector> h_mem_in_f2; std::vector> h_mem_in_d2; (frwd_transf_precision == rocfft_precision_single) ? run(low_bound_f, up_bound_f, h_mem_in_f2, h_mem_out_f2, h_mem_out_no_cb_f2) : run(low_bound_d, up_bound_d, h_mem_in_d2, h_mem_out_d2, h_mem_out_no_cb_d2); break; } case rocfft_transform_type_real_forward: { std::vector h_mem_in_f; std::vector h_mem_in_d; (frwd_transf_precision == rocfft_precision_single) ? run(low_bound_f, up_bound_f, h_mem_in_f, h_mem_out_f2, h_mem_out_no_cb_f2) : run(low_bound_d, up_bound_d, h_mem_in_d, h_mem_out_d2, h_mem_out_no_cb_d2); break; } default: break; } } size_t get_data_size() { // compute total data size size_t data_size = 1; for(size_t i = 0; i < dim; ++i) { data_size *= N; } return data_size; } template void run(Tbound low_bound, Tbound up_bound, std::vector& host_mem_in, std::vector& host_mem_out, std::vector& host_mem_out_no_cb) { auto data_sz = get_data_size(); if(cb_type == DefaultCallbackType::LOAD) set_load_callback(); else if(cb_type == DefaultCallbackType::STORE) set_store_callback(); host_mem_in.resize(data_sz); if constexpr(!is_hip_complex::value) init_data(low_bound, up_bound, host_mem_in); else init_data_complex(low_bound, up_bound, host_mem_in); if constexpr(!is_hip_complex::value) data_sz = (data_sz / 2) + 1; host_mem_out.resize(data_sz); forward_transform(true, host_mem_in, host_mem_out); host_mem_out_no_cb.resize(data_sz); forward_transform(false, host_mem_in, host_mem_out_no_cb); validate_test(host_mem_out, host_mem_out_no_cb); } template void init_data(const Tbound low_bound, const Tbound up_bound, 
std::vector& host_mem) { std::minstd_rand gen(seed); std::uniform_real_distribution dist(low_bound, up_bound); for(size_t i = 0; i < host_mem.size(); i++) { host_mem[i] = dist(gen); } } template void init_data_complex(const Tbound low_bound, const Tbound up_bound, std::vector& host_mem) { std::minstd_rand gen(seed); std::uniform_real_distribution dist(low_bound, up_bound); for(size_t i = 0; i < host_mem.size(); i++) { host_mem[i].x = dist(gen); host_mem[i].y = dist(gen); } } template void forward_transform(bool apply_callback, const std::vector& host_mem_in, std::vector& host_mem_out) { rocfft_plan plan = nullptr; std::vector lengths(dim, N); ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_notinplace, fwrd_transf_type, frwd_transf_precision, dim, lengths.data(), 1, nullptr), rocfft_status_success); size_t work_buffer_size = 0; void* work_buffer = nullptr; ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success); if(work_buffer_size) { ASSERT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess); } hipStream_wrapper_t stream; stream.alloc(); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); ASSERT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success); if(apply_callback) { if(cb_type == DefaultCallbackType::LOAD) { ASSERT_EQ(rocfft_execution_info_set_load_callback(info, &load_cb_host, nullptr, 0), rocfft_status_success); } else if(cb_type == DefaultCallbackType::STORE) { ASSERT_EQ( rocfft_execution_info_set_store_callback(info, &store_cb_host, nullptr, 0), rocfft_status_success); } } gpubuf device_mem_in; size_t NbytesIn = host_mem_in.size() * sizeof(Tin); ASSERT_EQ(device_mem_in.alloc(NbytesIn), hipSuccess); EXPECT_EQ( hipMemcpy(device_mem_in.data(), host_mem_in.data(), NbytesIn, hipMemcpyHostToDevice), hipSuccess); gpubuf device_mem_out; size_t NbytesOut = host_mem_out.size() * sizeof(Tout); ASSERT_EQ(device_mem_out.alloc(NbytesOut), hipSuccess); void* 
in_ptr = device_mem_in.data(); void* out_ptr = device_mem_out.data(); ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success); ASSERT_EQ(hipMemcpy(host_mem_out.data(), out_ptr, NbytesOut, hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success); ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); ASSERT_EQ(hipFree(work_buffer), hipSuccess); } template void validate_test(const std::vector& host_mem_out, const std::vector& host_mem_out_no_cb) { auto diff = distance_1to1_complex( reinterpret_cast*>(host_mem_out.data()), reinterpret_cast*>(host_mem_out_no_cb.data()), host_mem_out.size(), 1, 1, host_mem_out.size(), 1, host_mem_out_no_cb.size(), nullptr, type_epsilon(), {0}, {0}); EXPECT_LT(diff.l_inf, type_epsilon()); } // ------------------------------------------------ // set_load_callback template specializations // ------------------------------------------------ template void set_load_callback(){}; template <> void set_load_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_double), sizeof(void*)), hipSuccess); }; template <> void set_load_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_double), sizeof(void*)), hipSuccess); }; template <> void set_load_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_float), sizeof(void*)), hipSuccess); }; template <> void set_load_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_float), sizeof(void*)), hipSuccess); }; // ------------------------------------------------ // set_store_callback template specializations // ------------------------------------------------ template void set_store_callback(){}; template <> void set_store_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_double), sizeof(void*)), hipSuccess); }; template <> void set_store_callback() { 
EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_double), sizeof(void*)), hipSuccess); }; template <> void set_store_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_float), sizeof(void*)), hipSuccess); }; template <> void set_store_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_float), sizeof(void*)), hipSuccess); }; size_t N = 0; size_t dim = 0; rocfft_transform_type_e fwrd_transf_type; rocfft_precision_e frwd_transf_precision; DefaultCallbackType cb_type; uint32_t seed = 0; void* store_cb_host = nullptr; void* load_cb_host = nullptr; }; // ------------------------------------------------------------------- // Test forward transforms in single/double precision with real and // complex data inputs and having only a load callback set. // ------------------------------------------------------------------- TEST(rocfft_UnitTest, default_load_callback_complex_single) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(256, 1, rocfft_transform_type_complex_forward, rocfft_precision_single, DefaultCallbackType::LOAD, 1); } TEST(rocfft_UnitTest, default_load_callback_complex_double) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(512, 1, rocfft_transform_type_complex_forward, rocfft_precision_double, DefaultCallbackType::LOAD, 2); } TEST(rocfft_UnitTest, default_load_callback_real_single) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(1024, 1, rocfft_transform_type_real_forward, rocfft_precision_single, DefaultCallbackType::LOAD, 3); } TEST(rocfft_UnitTest, default_load_callback_real_double) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(2048, 1, rocfft_transform_type_real_forward, rocfft_precision_double, DefaultCallbackType::LOAD, 4); } // ------------------------------------------------------------------- // Test forward transforms in single/double precision with real and // complex data inputs and having only a store callback set. 
// ------------------------------------------------------------------- TEST(rocfft_UnitTest, default_store_callback_complex_single) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(256, 1, rocfft_transform_type_complex_forward, rocfft_precision_single, DefaultCallbackType::STORE, 5); } TEST(rocfft_UnitTest, default_store_callback_complex_double) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(512, 1, rocfft_transform_type_complex_forward, rocfft_precision_double, DefaultCallbackType::STORE, 6); } TEST(rocfft_UnitTest, default_store_callback_real_single) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(1024, 1, rocfft_transform_type_real_forward, rocfft_precision_single, DefaultCallbackType::STORE, 7); } TEST(rocfft_UnitTest, default_store_callback_real_double) { TEST_CALLBACK_CHECK_ASAN; Test_Callback test(2048, 1, rocfft_transform_type_real_forward, rocfft_precision_double, DefaultCallbackType::STORE, 8); } rocFFT-rocm-6.4.3/clients/tests/gtest_main.cpp000066400000000000000000000745601501537341300212640ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. /// @file /// @brief googletest based unit tester for rocfft /// #include #include #include #include #include #include #include #include #include #include #include #include #include "../../shared/CLI11.hpp" #include "../../shared/concurrency.h" #include "../../shared/device_properties.h" #include "../../shared/environment.h" #include "../../shared/hostbuf.h" #include "../../shared/rocfft_accuracy_test.h" #include "../../shared/sys_mem.h" #include "../../shared/test_params.h" #include "../../shared/work_queue.h" #include "bitwise_repro/bitwise_repro_db.h" #include "bitwise_repro/bitwise_repro_test.h" #include "rocfft/rocfft.h" // Control output verbosity: int verbose; // User-defined random seed size_t random_seed; // Overall probability of running conventional tests double test_prob; // Probability of running tests from the emulation suite double emulation_prob; // Modifier for probability of running tests with complex interleaved data double complex_interleaved_prob_factor; // Modifier for probability of running tests with real data double real_prob_factor; // Modifier for probability of running tests with complex planar data double complex_planar_prob_factor; // Modifier for probability of running tests with callbacks double callback_prob_factor; // Number of random tests per suite size_t n_random_tests = 0; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(bitwise_repro_test); // Transform parameters for manual test: fft_params manual_params; // Host memory limitation for tests (GiB): size_t ramgb; // Device memory limitation for tests (GiB): size_t vramgb; // Allow skipping tests if there is a runtime error bool skip_runtime_fails; // But count the number of failures int n_hip_failures = 0; // 
Pointer to a bitwise repro-db file std::unique_ptr repro_db; // Manually specified precision cutoffs: double half_epsilon; double single_epsilon; double double_epsilon; // Measured precision cutoffs: double max_linf_eps_double = 0.0; double max_l2_eps_double = 0.0; double max_linf_eps_single = 0.0; double max_l2_eps_single = 0.0; double max_linf_eps_half = 0.0; double max_l2_eps_half = 0.0; // Control whether we use FFTW's wisdom (which we use to imply FFTW_MEASURE). bool use_fftw_wisdom = false; // Compare results against FFTW in accuracy tests bool fftw_compare = true; // Cache the last cpu fft that was requested last_cpu_fft_cache last_cpu_fft_data; // Number of devices to distribute the FFT to for manual tests int manual_devices = 1; // Multi-process library to use fft_params::fft_mp_lib mp_lib = fft_params::fft_mp_lib_none; // Number of multi-process ranks to launch int mp_ranks = 1; // Multi-process launch command (e.g. mpirun --np 4 /path/to/rocfft_mpi_worker) std::string mp_launch; void init_gtest_flags() { // HACK: gtest maintains a "should run" flag on each test case, // but only sets it during RUN_ALL_TESTS. Precompiling should // ideally only happen for the test cases that would actually // run. // // So call RUN_ALL_TESTS once with the "list tests" temporarily set // to true, to initialize all of that. // // gtest will then print all of the test cases to stdout. // Temporarily redirect stdout to /dev/null as well. 
bool temp_list_tests = true; std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests)); // move stdout to devnull #ifdef WIN32 int stdout_fd = _fileno(stdout); int devnull = _open("NUL", _O_WRONLY); int stdout_copy = _dup(stdout_fd); _dup2(devnull, stdout_fd); #else int stdout_fd = STDOUT_FILENO; int devnull = open("/dev/null", O_WRONLY); int stdout_copy = dup(stdout_fd); dup2(devnull, stdout_fd); #endif (void)RUN_ALL_TESTS(); // put stdout back #ifdef WIN32 _dup2(stdout_copy, stdout_fd); _close(stdout_copy); _close(devnull); #else dup2(stdout_copy, stdout_fd); close(stdout_copy); close(devnull); #endif std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests)); } void precompile_test_kernels(const std::string& precompile_file) { std::cout << "precompiling test kernels...\n"; WorkQueue tokenQueue; init_gtest_flags(); std::vector tokens; auto ut = testing::UnitTest::GetInstance(); for(int ts_index = 0; ts_index < ut->total_test_suite_count(); ++ts_index) { const auto ts = ut->GetTestSuite(ts_index); for(int ti_index = 0; ti_index < ts->total_test_count(); ++ti_index) { const auto ti = ts->GetTestInfo(ti_index); std::string name = ti->name(); // only precompile test cases that will run if(!ti->should_run()) continue; // only care about accuracy tests if(name.find("vs_fftw/") != std::string::npos) { name.erase(0, 8); // Run any problem that uses brick decomposition // without touching batch. Bricks are specified with // batch indexes, so arbitrarily changing batch to 1 // can break those cases. 
if(name.find("_brick_") != std::string::npos) { tokens.emplace_back(std::move(name)); continue; } // change batch to 1, so we don't waste time creating // multiple plans that differ only by batch auto idx = name.find("_batch_"); if(idx == std::string::npos) continue; // advance idx to batch number idx += 7; auto end = name.find('_', idx); if(end == std::string::npos) continue; name.replace(idx, end - idx, "1"); tokens.emplace_back(std::move(name)); } } } std::random_device dev; std::mt19937 dist(dev()); std::shuffle(tokens.begin(), tokens.end(), dist); auto precompile_begin = std::chrono::steady_clock::now(); std::cout << "precompiling " << tokens.size() << " FFT plans...\n"; for(auto&& t : tokens) tokenQueue.push(std::move(t)); EnvironmentSetTemp env_compile_only{"ROCFFT_INTERNAL_COMPILE_ONLY", "1"}; const size_t NUM_THREADS = rocfft_concurrency(); std::vector threads; for(size_t i = 0; i < NUM_THREADS; ++i) { threads.emplace_back([&tokenQueue]() { for(;;) { std::string token{tokenQueue.pop()}; if(token.empty()) break; try { rocfft_params params_forward; params_forward.from_token(token); params_forward.validate(); params_forward.setup_structs(); params_forward.free(); rocfft_params params_inverse; params_inverse.inverse_from_forward(params_forward); params_inverse.validate(); params_inverse.setup_structs(); } catch(std::exception& e) { // failed to create a plan, abort // // we could continue on, but the test should just // fail later anyway in the same way. 
so report // which token failed early and get out throw std::runtime_error(token + " plan creation failure: " + e.what()); } } }); // insert empty tokens to tell threads to stop tokenQueue.push({}); } for(auto& t : threads) t.join(); auto precompile_end = std::chrono::steady_clock::now(); std::chrono::duration precompile_ms = precompile_end - precompile_begin; std::cout << "done precompiling FFT plans in " << static_cast(precompile_ms.count()) << " ms\n"; } int main(int argc, char* argv[]) { const auto test_begin = std::chrono::system_clock::now(); // We would like to parse a few arguments before initiating gtest. // Save argv[0] because CLI doesn't include this in the remaining args, and it's expected when // we re-parse the arguments with gtest and CLI. std::string argv0 = argv[0]; CLI::App app{ "\n" "rocFFT Runtime Test command line options\n" "NB: input parameters are row-major.\n" "\n" "FFTW accuracy test cases are named using these identifiers:\n" "\n" " len_: problem dimensions, row-major\n" " single,double: precision\n" " ip,op: in-place or out-of-place\n" " batch_: batch size\n" " istride__: input stride (ostride for output stride), format may be:\n" " CI - complex interleaved\n" " CP - complex planar\n" " R - real\n" " HI - hermitian interleaved\n" " HP - hermitian planar\n" "\n" "Usage"}; // Override CLI11 help to print after later CLI11 options that are defined, and allow gtest's // help. // After removing the stage-1 options, individual options are set to null (even if set), but we // can still capture the behaviour by using a flag. 
for(auto opt : app.get_options()) { app.remove_option(opt); } app.add_option("-v, --verbose", verbose, "Print out detailed information for the tests") ->default_val(0); app.add_option("--nrand", n_random_tests, "Number of extra randomized tests")->default_val(0); app.add_option("--test_prob", test_prob, "Probability of running individual tests") ->default_val(1.0) ->check(CLI::Range(0.0, 1.0)); app.add_option( "--emulation_prob", test_prob, "Probability of running individual emulation tests") ->default_val(1.0) ->check(CLI::Range(0.0, 1.0)); app.add_option("--real_prob", real_prob_factor, "Probability multiplier for running individual real/complex transforms") ->default_val(1.0) ->check(CLI::PositiveNumber); app.add_option("--planar_prob", complex_planar_prob_factor, "Probability multiplier for running individual planar transforms") ->default_val(0.1) ->check(CLI::PositiveNumber); app.add_option( "--complex_interleaved_prob_factor", complex_interleaved_prob_factor, "Probability multiplier for running individual transforms with complex interleaved data") ->default_val(1) ->check(CLI::PositiveNumber); app.add_option("--callback_prob", callback_prob_factor, "Probability multiplier for running individual callback transforms") ->default_val(0.1) ->check(CLI::PositiveNumber); constexpr std::array emulation_types = {"none", "smoke", "regression", "extended"}; app.add_option("--emulation", "Run emulation tests") ->check(CLI::IsMember(emulation_types)) ->each([&](const std::string& emulationtype) { constexpr auto nidx = [emulation_types](const auto name) { return std::find(emulation_types.begin(), emulation_types.end(), name) - emulation_types.begin(); }; // Emulation test suites focus on well-established software paths; we are looking for // information about the hardware, which means that we aren't trying to find out a lot // of information about the software. Thus, no randomly-generated tests. 
n_random_tests = 0; // Run all of the emulation tests: emulation_prob = 1.0; // Callbacks are not an emulation test target. callback_prob_factor = 0; // We can do a switch on nidx(emulationtype) when we have C++20 // switch(nidx(emulationtype)) // { // case nidx("smoke"): // etc. if(nidx(emulationtype) == nidx("smoke")) { // 2GB vram limit, approx 1 minute GPU time with short tests. vramgb = 2; test_prob = 0; emulation_prob = 0.005; } if(nidx(emulationtype) == nidx("regression")) { vramgb = 16; emulation_prob = 1; test_prob = 0.01; } if(nidx(emulationtype) == nidx("extended")) { emulation_prob = 1; test_prob = 0.02; } }); app.add_option("--fftw_compare", fftw_compare, "Compare to FFTW in accuracy tests") ->default_val(true); app.add_option("--mp_lib", mp_lib, "Multi-process library type: none (default), mpi") ->default_val("none"); app.add_option("--mp_ranks", mp_ranks, "Number of multi-process ranks to launch") ->default_val(1) ->check(CLI::NonNegativeNumber); app.add_option("--mp_launch", mp_launch, "Command line prefix to launch multi-process transforms, e.g. \"mpirun --np 4 " "/path/to/rocfft_mpi_worker\"") ->default_val("") ->each([&](const std::string&) { if(mp_lib == fft_params::fft_mp_lib_none) { std::cout << "--mp_launch requires an mp library (see mp_lib in --help).\n"; std::exit(-1); } }) ->needs("--mp_lib"); app.add_flag("--smoketest", "Run a short (approx 5 minute) randomized selection of tests") ->each([&](const std::string&) { // The objective is to have an test that takes about 5 minutes, so just set the // probability per test to a small value to achieve this result. test_prob = 0.001; emulation_prob = 0.01; n_random_tests = 10; }); app.add_flag("--callback", "Inject load/store callbacks")->each([&](const std::string&) { manual_params.run_callbacks = true; }); { // We explicitly scope opt_seed so that the object falls out of scope before the final // parsing of the command line arguments. 
Otherwise, the second parsing would mark the // option as not having been specified, which can get rather confusing. auto opt_seed = app.add_option( "--seed", random_seed, "Random seed; if unset, use an actual random seed"); // Try parsing initial args that will be used to configure tests. // Allow extras to pass on gtest and rocFFT arguments without error. app.allow_extras(); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } if(!*opt_seed) { std::cout << "Generating random seed: "; std::random_device dev; random_seed = dev(); std::cout << random_seed << "\n"; } } app.set_help_flag(""); auto opt_help = app.add_flag("-h, --help", "Produces this help message"); std::vector remaining_args = app.remaining(); // Google test ignores the first element, so add something there so that it parses all of hte // arguments that we want it to parse.: remaining_args.insert(remaining_args.begin(), argv0); // NB: If we initialize gtest first, then it removes all of its own command-line // arguments and sets argc and argv correctly; std::vector carg; for(std::string& s : remaining_args) { carg.push_back(&s[0]); } carg.push_back(NULL); decltype(argc) cargc = carg.size() - 1; ::testing::InitGoogleTest(&cargc, carg.data()); // Filename for fftw and fftwf wisdom. std::string fftw_wisdom_filename; // Token string to fully specify fft params for the manual test. std::string test_token; // Filename for precompiled kernels to be written to std::string precompile_file; // Full path to bitwise repro database file std::string repro_db_path; // Declare the supported options. Some option pointers are declared to track passed opts. 
app.add_flag("--version", "Print queryable version information from the rocfft library") ->each([](const std::string&) { char v[256]; rocfft_get_version_string(v, 256); std::cout << "version " << v << std::endl; return EXIT_SUCCESS; }); app.add_flag("--checkstride", "Check that data is not written outside of output strides") ->each([&](const std::string&) { manual_params.check_output_strides = true; }); auto opt_token = app.add_option("--token", test_token, "Test token name for manual test")->default_val(""); // Group together options that conflict with --token auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { manual_params.precision = fft_precision_double; }); non_token->excludes(opt_token); non_token ->add_option("-t, --transformType", manual_params.transform_type, "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); non_token ->add_option("--precision", manual_params.precision, "Transform precision: single (default), double, half") ->excludes("--double"); non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)") ->each([&](const std::string&) { manual_params.placement = fft_placement_notinplace; }); non_token ->add_option("--itype", manual_params.itype, "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token ->add_option("--otype", manual_params.otype, "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token->add_option("--length", manual_params.length, "Lengths")->expected(1, 3); non_token ->add_option("-b, --batchSize", manual_params.nbatch, "If this value 
is greater than one, arrays will be used") ->default_val(1); non_token->add_option("--istride", manual_params.istride, "Input stride"); non_token->add_option("--ostride", manual_params.ostride, "Output stride"); non_token->add_option("--idist", manual_params.idist, "Logical distance between input batches") ->default_val(0); non_token->add_option("--odist", manual_params.odist, "Logical distance between output batches") ->default_val(0); non_token->add_option("--ioffset", manual_params.ioffset, "Input offset"); non_token->add_option("--ooffset", manual_params.ooffset, "Output offset"); app.add_option("--isize", manual_params.isize, "Logical size of input buffer"); app.add_option("--osize", manual_params.osize, "Logical size of output buffer"); app.add_option("--R", ramgb, "RAM limit in GiB for tests") ->default_val(host_memory::singleton().get_total_gbytes()); app.add_option("--V", vramgb, "VRAM limit in GiB for tests")->default_val(0); app.add_option("--half_epsilon", half_epsilon)->default_val(9.77e-4); app.add_option("--single_epsilon", single_epsilon)->default_val(3.75e-5); app.add_option("--double_epsilon", double_epsilon)->default_val(1e-15); app.add_option("--skip_runtime_fails", skip_runtime_fails, "Skip the test if there is a runtime failure") ->default_val(true); app.add_option("-w, --wise", use_fftw_wisdom, "Use FFTW wisdom"); app.add_option("-W, --wisdomfile", fftw_wisdom_filename, "FFTW3 wisdom filename") ->default_val("wisdom3.txt"); app.add_option("--manual_devices", manual_devices, "Distribute manual test case among this many devices") ->default_val(1) ->check(CLI::PositiveNumber); app.add_option("--scalefactor", manual_params.scale_factor, "Scale factor to apply to output"); app.add_option("--repro-db", repro_db_path, "Database file full path name for bitwise reproducibility tests"); app.add_option("--precompile", precompile_file, "Precompile kernels to a file for all test cases before running tests") ->default_val(""); // Default value is set in 
fft_params.h based on if device-side PRNG was enabled. app.add_option("-g, --inputGen", manual_params.igen, "Input data generation:\n0) PRNG sequence (device)\n" "1) PRNG sequence (host)\n" "2) linearly-spaced sequence (device)\n" "3) linearly-spaced sequence (host)"); // Parse rest of args and catch any errors here try { app.parse(cargc, carg.data()); } catch(const CLI::ParseError& e) { return app.exit(e); } if(*opt_help) { std::cout << app.help() << "\n"; return EXIT_SUCCESS; } // Ensure there are no leftover options used by neither gtest nor CLI11 const auto leftover_args = app.remaining(); if(!leftover_args.empty()) { std::cout << "Unrecognised option(s) found:\n "; for(auto i : leftover_args) std::cout << i << " "; std::cout << "\nRun with --help for more information.\n"; return EXIT_FAILURE; } std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon << "\tdouble epsilon: " << double_epsilon << "\n"; std::cout << "Random seed: " << random_seed << "\n"; // If precompiling, tell rocFFT to use the specified cache file // to write kernels to // // But if our environment already has a cache file for RTC, then // we should just use that std::unique_ptr env_precompile; if(!precompile_file.empty() && rocfft_getenv("ROCFFT_RTC_CACHE_PATH").empty()) { env_precompile = std::make_unique("ROCFFT_RTC_CACHE_PATH", precompile_file.c_str()); } rocfft_setup(); char v[256]; rocfft_get_version_string(v, 256); std::cout << "rocFFT version: " << v << "\n"; #ifdef FFTW_MULTITHREAD fftw_init_threads(); fftwf_init_threads(); fftw_plan_with_nthreads(rocfft_concurrency()); fftwf_plan_with_nthreads(rocfft_concurrency()); #endif // Set host memory limit from command-line options host_memory::singleton().set_limit_gbytes(ramgb); if(use_fftw_wisdom) { if(verbose) { std::cout << "Using " << fftw_wisdom_filename << " wisdom file\n"; } std::ifstream fftw_wisdom_file(fftw_wisdom_filename); std::string allwisdom = 
std::string(std::istreambuf_iterator(fftw_wisdom_file), std::istreambuf_iterator()); std::string fftw_wisdom; std::string fftwf_wisdom; bool load_wisdom = false; bool load_fwisdom = false; std::istringstream input; input.str(allwisdom); // Separate the single-precision and double-precision wisdom: for(std::string line; std::getline(input, line);) { if(line.rfind("(fftw", 0) == 0 && line.find("fftw_wisdom") != std::string::npos) { load_wisdom = true; } if(line.rfind("(fftw", 0) == 0 && line.find("fftwf_wisdom") != std::string::npos) { load_fwisdom = true; } if(load_wisdom) { fftw_wisdom.append(line + "\n"); } if(load_fwisdom) { fftwf_wisdom.append(line + "\n"); } if(line.rfind(")", 0) == 0) { load_wisdom = false; load_fwisdom = false; } } fftw_import_wisdom_from_string(fftw_wisdom.c_str()); fftwf_import_wisdom_from_string(fftwf_wisdom.c_str()); } if(!repro_db_path.empty()) repro_db.reset(new fft_hash_db(repro_db_path)); if(!test_token.empty()) { std::cout << "Reading fft params from token:\n" << test_token << "\n"; try { manual_params.from_token(test_token); } catch(...) { std::cout << "Unable to parse token.\n"; return 1; } } else { if(manual_params.length.empty()) { manual_params.length.push_back(8); // TODO: add random size? } if(manual_params.istride.empty()) { manual_params.istride.push_back(1); // TODO: add random size? } if(manual_params.ostride.empty()) { manual_params.ostride.push_back(1); // TODO: add random size? 
} } if(!precompile_file.empty()) precompile_test_kernels(precompile_file); auto retval = RUN_ALL_TESTS(); if(use_fftw_wisdom) { std::string fftw_wisdom = std::string(fftw_export_wisdom_to_string()); std::string fftwf_wisdom = std::string(fftwf_export_wisdom_to_string()); fftw_wisdom.append(std::string(fftwf_export_wisdom_to_string())); std::ofstream fftw_wisdom_file(fftw_wisdom_filename); fftw_wisdom_file << fftw_wisdom; fftw_wisdom_file << fftwf_wisdom; fftw_wisdom_file.close(); } rocfft_cleanup(); const auto test_duration = std::chrono::system_clock::now() - test_begin; const auto test_hours = std::chrono::duration_cast(test_duration); const auto test_minutes = std::chrono::duration_cast(test_duration - test_hours); std::cout << "Test suite took " << test_hours.count() << " hours " << test_minutes.count() << " minutes\n\n"; std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << "\n"; std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << "\n"; std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << "\n"; std::cout << "single precision max l2 epsilon: " << max_l2_eps_single << "\n"; std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << "\n"; std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << "\n"; std::cout << "Number of runtime issues: " << n_hip_failures << "\n"; std::cout << "\nRandom seed: " << random_seed << "\n"; return retval; } TEST(manual, vs_fftw) // MANUAL TESTS HERE { rocfft_params params(manual_params); if(manual_devices > 1) { // just distribute along the slowest FFT dimension std::vector deviceGrid(params.length.size() + 1, 1); deviceGrid[1] = manual_devices; params.distribute_input(manual_devices, deviceGrid); params.distribute_output(manual_devices, deviceGrid); } // Run an individual test using the provided command-line parameters. 
params.validate(); std::cout << "Manual test:" << "\n\t" << params.str("\n\t") << "\n"; std::cout << "Token: " << params.token() << "\n"; if(!params.valid(verbose + 2)) { std::cout << "manual params are not valid\n"; } try { fft_vs_reference(params); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { // explicitly clear test cache last_cpu_fft_data = last_cpu_fft_cache(); GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } } TEST(manual, bitwise_reproducibility) // MANUAL TESTS HERE { if(repro_db == nullptr) GTEST_SKIP() << "A database file is required for this test." << std::endl; rocfft_params params(manual_params); // Run an individual test using the provided command-line parameters. params.validate(); std::cout << "Manual test:" << "\n\t" << params.str("\n\t") << "\n"; std::cout << "Token: " << params.token() << "\n"; if(!params.valid(verbose + 2)) { std::cout << "manual params are not valid\n"; } try { bitwise_repro(params); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } SUCCEED(); } rocFFT-rocm-6.4.3/clients/tests/hermitian_test.cpp000066400000000000000000000263621501537341300221460ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/gpubuf.h" #include "../../shared/rocfft_params.h" #include "../samples/rocfft/examplekernels.h" #include "../samples/rocfft/exampleutils.h" #include "rocfft/rocfft.h" #include #include #include #include #include #include #include void run_1D_hermitian_test(size_t length) { // Run two 1D C2R transforms, on: // * random input // * identical random input, but modified to be Hermitian-symmetric // We should tolerate the input being having non-zero imaginary part in the DC mode // and the Nyquist frequency (of the length is even). 
rocfft_params p; p.length = {length}; p.precision = fft_precision_double; p.transform_type = fft_transform_type_real_inverse; p.placement = fft_placement_notinplace; p.validate(); if(verbose) { std::cout << p.str("\n\t") << std::endl; } ASSERT_TRUE(p.valid(verbose)); std::vector h_input(p.isize[0]); std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution dis(0.0, 1.0); for(auto& val : h_input) { val.x = dis(gen); val.y = dis(gen); } if(verbose > 2) { std::cout << "non-Hermitian input:"; for(const auto& val : h_input) { std::cout << " " << "(" << val.x << ", " << val.y << ")"; } std::cout << std::endl; } gpubuf ibuf; ASSERT_TRUE(ibuf.alloc(p.ibuffer_sizes()[0]) == hipSuccess); ASSERT_TRUE(hipMemcpy(ibuf.data(), h_input.data(), ibuf.size(), hipMemcpyHostToDevice) == hipSuccess); gpubuf obuf; ASSERT_TRUE(obuf.alloc(p.obuffer_sizes()[0]) == hipSuccess); ASSERT_TRUE(p.create_plan() == fft_status_success); std::vector pibuf = {ibuf.data()}; std::vector pobuf = {obuf.data()}; ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success); std::vector h_output(p.osize[0]); ASSERT_TRUE(hipMemcpy(h_output.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost) == hipSuccess); ASSERT_TRUE(hipDeviceSynchronize() == hipSuccess); if(verbose > 2) { std::cout << "output:"; for(const auto& val : h_output) { std::cout << " " << val; } std::cout << std::endl; } std::vector h_input1(p.isize[0]); std::copy(h_input.begin(), h_input.end(), h_input1.begin()); // Impose Hermitian symmetry on the input: h_input1[0].y = 0.0; if(p.length[0] % 2 == 0) { h_input1.back().y = 0.0; } if(verbose > 2) { std::cout << "Hermitian input:"; for(const auto& val : h_input1) { std::cout << " " << "(" << val.x << ", " << val.y << ")"; } std::cout << std::endl; } double maxdiff = 0.0; for(unsigned int i = 0; i < h_input.size(); ++i) { auto val = std::abs( rocfft_complex(h_input[i].x - h_input1[i].x, h_input[i].y - h_input1[i].y)); if(val > maxdiff) maxdiff = val; } 
ASSERT_TRUE(maxdiff > 0.0); ASSERT_TRUE(hipMemcpy(ibuf.data(), h_input1.data(), ibuf.size(), hipMemcpyHostToDevice) == hipSuccess); ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success); std::vector h_output1(p.osize[0]); ASSERT_TRUE(hipMemcpy(h_output1.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost) == hipSuccess); if(verbose > 2) { std::cout << "output:"; for(const auto& val : h_output1) { std::cout << " " << val; } std::cout << std::endl; } double maxerr = 0; for(unsigned int i = 0; i < h_output.size(); ++i) { auto val = std::abs(h_output[i] - h_output1[i]); if(val > maxerr) maxerr = val; } if(verbose) std::cout << maxerr << std::endl; EXPECT_TRUE(maxerr == 0.0); } // test a case that's small enough that it only needs one kernel TEST(rocfft_UnitTest, 1D_hermitian_single_small) { run_1D_hermitian_test(8); } // test a case that's big enough that it needs multiple kernels TEST(rocfft_UnitTest, 1D_hermitian_single_large) { run_1D_hermitian_test(8192); } template std::string str(T begin, T end) { std::stringstream ss; bool first = true; for(; begin != end; begin++) { if(!first) ss << ", "; ss << *begin; first = false; } return ss.str(); } // Test that the GPU Hermitian symmetrizer code produces the correct results. TEST(rocfft_UnitTest, gpu_symmetrizer) { std::vector> lengths = {{4, 4, 3}, {5}, {8}, {5, 5}, {5, 8}, {8, 5}, {8, 8}, {5, 5, 5}, {8, 5, 5}, {5, 8, 5}, {5, 5, 8}, {5, 8, 8}, {8, 5, 8}, {8, 8, 5}, {8, 8, 8}}; for(const auto& length : lengths) { // Symmetrize complex data and ensure that the checker sees that it's symmetric. 
// Use the params class to set up strides and lengths: rocfft_params p; p.length = length; p.precision = fft_precision_double; p.transform_type = fft_transform_type_real_inverse; p.placement = fft_placement_notinplace; p.validate(); if(verbose) { std::cout << "\t" << p.str("\n\t") << std::endl; } ASSERT_TRUE(p.valid(verbose)); // Data buffers: gpubuf buf; ASSERT_TRUE(buf.alloc(sizeof(hipDoubleComplex) * p.isize[0]) == hipSuccess); std::vector hbuf(p.isize[0]); // Initialize a Hermitian-symmetric array; it should be symmetric. init_hermitiancomplex_cm(p.length_cm(), p.ilength_cm(), p.istride_cm(), buf.data()); ASSERT_TRUE(hipMemcpy(hbuf.data(), buf.data(), buf.size(), hipMemcpyDeviceToHost) == hipSuccess); if(verbose > 1) { printbuffer_cm(hbuf, p.ilength_cm(), p.istride_cm(), p.nbatch, p.idist); } EXPECT_TRUE( check_symmetry_cm(hbuf, p.length_cm(), p.istride_cm(), p.nbatch, p.idist, verbose > 0)) << "length: " << str(length.begin(), length.end()); // This should not be symmetric: std::mt19937_64 rng; std::seed_seq ss{uint32_t(10)}; rng.seed(ss); std::uniform_real_distribution unif(0, 1); for(auto& v : hbuf) { v.x = unif(rng); v.y = unif(rng); } if(verbose > 2) { printbuffer_cm(hbuf, p.ilength_cm(), p.istride_cm(), p.nbatch, p.idist); } EXPECT_TRUE( !check_symmetry_cm(hbuf, p.length_cm(), p.istride_cm(), p.nbatch, p.idist, false)) << "length: " << str(length.begin(), length.end()); } for(const auto& length : lengths) { // Generate Hermitian-symmetric data and ensure that applying the symmetrizer has no effect. 
rocfft_params p; p.length = length; p.precision = fft_precision_double; p.transform_type = fft_transform_type_real_forward; p.placement = fft_placement_notinplace; p.validate(); if(verbose) { std::cout << "\t" << p.str("\n\t") << std::endl; } ASSERT_TRUE(p.valid(verbose)); ASSERT_TRUE(p.create_plan() == fft_status_success); gpubuf ibuf, obuf; ASSERT_TRUE(ibuf.alloc(p.ibuffer_sizes()[0]) == hipSuccess); ASSERT_TRUE(obuf.alloc(p.obuffer_sizes()[0]) == hipSuccess); initreal_cm(p.length_cm(), p.istride_cm(), ibuf.data()); std::vector pibuf = {ibuf.data()}; std::vector pobuf = {obuf.data()}; ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success); std::vector h_output(p.osize[0]); std::fill(h_output.begin(), h_output.end(), hipDoubleComplex{0.0, 0.0}); ASSERT_TRUE( hipMemcpy(h_output.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost) == hipSuccess); impose_hermitian_symmetry_cm(p.length_cm(), p.olength_cm(), p.ostride_cm(), obuf.data()); std::vector h_output_resym(p.osize[0]); std::fill(h_output_resym.begin(), h_output_resym.end(), hipDoubleComplex{0.0, 0.0}); ASSERT_TRUE( hipMemcpy( h_output_resym.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost) == hipSuccess); double maxdiff = 0; for(unsigned int i = 0; i < h_output.size(); ++i) { auto rdiff = std::abs(h_output[i].x - h_output_resym[i].x); auto idiff = std::abs(h_output[i].y - h_output_resym[i].y); maxdiff = std::max({maxdiff, rdiff, idiff}); } if(verbose) { std::cout << "maxdiff: " << maxdiff << std::endl; } if(verbose > 2) { std::cout << "before symmetrization:\n"; printbuffer_cm(h_output, p.olength_cm(), p.ostride_cm(), p.nbatch, p.odist); std::cout << "after symmetrization:\n"; printbuffer_cm(h_output_resym, p.olength_cm(), p.ostride_cm(), p.nbatch, p.odist); } EXPECT_TRUE(maxdiff < 1e-13) << maxdiff << "\n" << p.str() << "\n"; } } rocFFT-rocm-6.4.3/clients/tests/hipGraph_test.cpp000066400000000000000000000332701501537341300217240ustar00rootroot00000000000000// 
Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h" #include "../../shared/arithmetic.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "../../shared/rocfft_against_fftw.h" #include "../../shared/rocfft_params.h" #include "rocfft/rocfft.h" #include #include #include #include #include static const unsigned int KERNEL_THREADS = 64; __global__ void scale_data_kernel(rocfft_complex* data, size_t length, float scale) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < length) { data[idx].x *= scale; data[idx].y *= scale; } } template __global__ void offset_data_kernel_complex(T* data, size_t length, T offset) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < length) { data[idx].x += offset.x; data[idx].y += offset.y; } } template __global__ void offset_data_kernel_real(T* data, size_t length, T offset) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < length) { data[idx] += offset; } } static void init_input_data(size_t N, size_t seed, std::vector>& host_data, gpubuf_t>& device_data) { std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); host_data.resize(N); for(size_t i = 0; i < N; i++) { host_data[i].x = dist(gen); host_data[i].y = dist(gen); } size_t Nbytes = N * sizeof(rocfft_complex); if(device_data.alloc(Nbytes) != hipSuccess) throw std::bad_alloc(); EXPECT_EQ(hipMemcpy(device_data.data(), host_data.data(), Nbytes, hipMemcpyHostToDevice), hipSuccess); } template static void init_data(size_t N, T init_val, std::vector& host_data, gpubuf_t& device_data) { host_data.resize(N); std::fill(host_data.begin(), host_data.end(), init_val); size_t Nbytes = N * sizeof(T); if(device_data.alloc(Nbytes) != hipSuccess) throw std::bad_alloc(); EXPECT_EQ(hipMemcpy(device_data.data(), host_data.data(), Nbytes, hipMemcpyHostToDevice), hipSuccess); } static void create_forward_fft_plan(size_t N, rocfft_plan& plan) { auto dim = 1; std::vector lengths(dim, N); 
ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_notinplace, rocfft_transform_type_complex_forward, rocfft_precision_single, dim, lengths.data(), 1, nullptr), rocfft_status_success); } static void create_inverse_fft_plan(size_t N, rocfft_plan& plan_inv) { auto dim = 1; std::vector lengths(dim, N); ASSERT_EQ(rocfft_plan_create(&plan_inv, rocfft_placement_inplace, rocfft_transform_type_complex_inverse, rocfft_precision_single, dim, lengths.data(), 1, nullptr), rocfft_status_success); } static void set_fft_info(hipStream_t stream, rocfft_execution_info& info) { EXPECT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); EXPECT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success); } static void run_forward_fft(rocfft_execution_info info, const rocfft_plan plan, void* in_ptr, void* out_ptr) { ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success); } static void run_inverse_fft(rocfft_execution_info info, const rocfft_plan plan_inv, void* in_ptr, void* out_ptr) { // Execute inverse plan in-place ASSERT_EQ(rocfft_execute(plan_inv, &in_ptr, &out_ptr, info), rocfft_status_success); } static void scale_device_data(hipStream_t stream, float scale, size_t N, rocfft_complex* data) { auto blockSize = KERNEL_THREADS; auto numBlocks = DivRoundingUp(N, blockSize); hipLaunchKernelGGL(scale_data_kernel, dim3(numBlocks), dim3(blockSize), 0, // sharedMemBytes stream, // stream data, N, scale); } template static void offset_device_data_real(hipStream_t stream, T offset, size_t N, T* data) { auto blockSize = KERNEL_THREADS; auto numBlocks = DivRoundingUp(N, blockSize); hipLaunchKernelGGL(offset_data_kernel_real, dim3(numBlocks), dim3(blockSize), 0, // sharedMemBytes stream, // stream data, N, offset); } template static void offset_device_data_complex(hipStream_t stream, T offset, size_t N, T* data) { auto blockSize = KERNEL_THREADS; auto numBlocks = DivRoundingUp(N, blockSize); hipLaunchKernelGGL(offset_data_kernel_complex, 
dim3(numBlocks), dim3(blockSize), 0, // sharedMemBytes stream, // stream data, N, offset); } template static void compare_data_exact_match(hipStream_t other_stream, const std::vector& host_data, const gpubuf_t& device_data) { std::vector host_data_compare(host_data.size()); // Copy result back to host ASSERT_EQ(hipMemcpyAsync(host_data_compare.data(), device_data.data(), host_data_compare.size() * sizeof(T), hipMemcpyDeviceToHost, other_stream), hipSuccess); ASSERT_EQ(hipStreamSynchronize(other_stream), hipSuccess); ASSERT_EQ(host_data == host_data_compare, true); } static void compare_data(const std::vector>& original_host_data, const gpubuf_t>& modified_device_data) { std::vector> modified_host_data(original_host_data.size()); // Copy result back to host ASSERT_EQ(hipMemcpy(modified_host_data.data(), modified_device_data.data(), modified_host_data.size() * sizeof(rocfft_complex), hipMemcpyDeviceToHost), hipSuccess); // Compare data we got to the original. // We're running 2 transforms (forward+inverse), so we // should tolerate 2x the error of a single transform. 
const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); auto input_norm = norm_complex(reinterpret_cast*>(original_host_data.data()), original_host_data.size(), 1, 1, original_host_data.size(), {0}); auto diff = distance_1to1_complex( reinterpret_cast*>(original_host_data.data()), reinterpret_cast*>(modified_host_data.data()), // data is all contiguous, we can treat it as 1d original_host_data.size(), 1, 1, original_host_data.size(), 1, modified_host_data.size(), nullptr, MAX_TRANSFORM_ERROR, {0}, {0}); EXPECT_LT(diff.l_2 / input_norm.l_2, sqrt(log2(original_host_data.size())) * MAX_TRANSFORM_ERROR); EXPECT_LT(diff.l_inf / input_norm.l_inf, log2(original_host_data.size()) * MAX_TRANSFORM_ERROR); } TEST(rocfft_UnitTest, hipGraph_execution) { hipGraph_t graph = nullptr; hipGraphExec_t graph_exec = nullptr; size_t N = 256; size_t seed = 100; auto offset_1 = rocfft_complex{.1, .1}; auto offset_2 = rocfft_complex{-.1, -.1}; float scale = 2.2; float inv_scale = 1. / scale; auto output_init_val = rocfft_complex(0., 0.); size_t num_kernel_launches = 100; size_t num_graph_launches = 10; gpubuf_t> device_mem_in; std::vector> host_mem_in; init_input_data(N, seed, host_mem_in, device_mem_in); rocfft_complex* in_ptr = static_cast*>(device_mem_in.data()); gpubuf_t> device_mem_out; std::vector> host_mem_out; init_data>(N, output_init_val, host_mem_out, device_mem_out); rocfft_complex* out_ptr = static_cast*>(device_mem_out.data()); gpubuf_t device_mem_counter; std::vector host_mem_counter; init_data(N, 0, host_mem_counter, device_mem_counter); size_t* counter_ptr = static_cast(device_mem_counter.data()); rocfft_plan plan; create_forward_fft_plan(N, plan); rocfft_plan plan_inv; create_inverse_fft_plan(N, plan_inv); EXPECT_EQ(hipDeviceSynchronize(), hipSuccess); hipStream_wrapper_t stream; hipStream_wrapper_t other_stream; stream.alloc(); other_stream.alloc(); ASSERT_EQ(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal), hipSuccess); rocfft_execution_info info; 
set_fft_info(stream, info); // add offset to device input data for(size_t i = 0; i < num_kernel_launches; ++i) offset_device_data_complex>(stream, offset_1, N, in_ptr); // back out the offsets for(size_t i = 0; i < num_kernel_launches; ++i) offset_device_data_complex>(stream, offset_2, N, in_ptr); // scale the device input data scale_device_data(stream, scale, N, in_ptr); // backout the scale scale_device_data(stream, inv_scale, N, in_ptr); // run forward transform on input data run_forward_fft(info, plan, in_ptr, out_ptr); // scale the device output data scale_device_data(stream, scale, N, out_ptr); // backout the scale scale_device_data(stream, inv_scale, N, out_ptr); // run (in-place) inverse transform on output data run_inverse_fft(info, plan_inv, out_ptr, nullptr); // normalize results of an inverse transform, so it can be directly // compared to the original data before the forward transform auto inv_scale_N = 1. / static_cast(N); scale_device_data(stream, inv_scale_N, N, out_ptr); // add offset to device output data for(size_t i = 0; i < num_kernel_launches; ++i) offset_device_data_complex>(stream, offset_1, N, out_ptr); // back out the offsets for(size_t i = 0; i < num_kernel_launches; ++i) offset_device_data_complex>(stream, offset_2, N, out_ptr); // increment counter offset_device_data_real(stream, 1, N, counter_ptr); ASSERT_EQ(hipStreamEndCapture(stream, &graph), hipSuccess); // make sure no actual work has been done for // the captured stream before graph execution compare_data_exact_match>(other_stream, host_mem_out, device_mem_out); ASSERT_EQ(hipGraphInstantiate(&graph_exec, graph, NULL, NULL, 0), hipSuccess); ASSERT_EQ(hipGraphDestroy(graph), hipSuccess); for(size_t i = 0; i < num_graph_launches; ++i) ASSERT_EQ(hipGraphLaunch(graph_exec, stream), hipSuccess); ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess); ASSERT_EQ(hipStreamDestroy(stream), hipSuccess); // check for correctness of the output data compare_data(host_mem_in, device_mem_out); // 
check for correctness of the counter // incremented with multiple graph executions std::vector host_mem_counter_modified(N); fill(host_mem_counter_modified.begin(), host_mem_counter_modified.end(), num_graph_launches); compare_data_exact_match(other_stream, host_mem_counter_modified, device_mem_counter); ASSERT_EQ(hipStreamDestroy(other_stream), hipSuccess); } rocFFT-rocm-6.4.3/clients/tests/multi_device_test.cpp000066400000000000000000000261571501537341300226410ustar00rootroot00000000000000// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_params.h" #include #include extern fft_params::fft_mp_lib mp_lib; extern int mp_ranks; static const std::vector> multi_gpu_sizes = { {256}, {256, 256}, {256, 256, 256}, }; enum SplitType { // split both input and output on slow FFT dimension SLOW_INOUT, // split only input on slow FFT dimension, output is not split SLOW_IN, // split only output on slow FFT dimension, input is not split SLOW_OUT, // split input on slow FFT dimension, and output on fast FFT dimension SLOW_IN_FAST_OUT, // 3D pencil decomposition - one dimension is contiguous on input // and another dimension contiguous on output, remaining dims are // both split PENCIL_3D, }; std::vector param_generator_multi_gpu(const SplitType type) { int localDeviceCount = 0; (void)hipGetDeviceCount(&localDeviceCount); // need multiple devices or multiprocessing to test anything if(localDeviceCount < 2 && mp_lib == fft_params::fft_mp_lib_none) return {}; // limit local device testing to 16 GPUs, as we have some // bottlenecks with larger device counts that unreasonably slow // down plan creation localDeviceCount = std::min(16, localDeviceCount); auto params_complex = param_generator_complex(test_prob, multi_gpu_sizes, precision_range_sp_dp, {4, 1}, stride_generator({{1}}), stride_generator({{1}}), {{0, 0}}, {{0, 0}}, {fft_placement_inplace, fft_placement_notinplace}, false); auto params_real = param_generator_real(test_prob, multi_gpu_sizes, precision_range_sp_dp, {4, 1}, stride_generator({{1}}), stride_generator({{1}}), {{0, 0}}, {{0, 0}}, {fft_placement_notinplace}, false); std::vector all_params; auto distribute_params = [=, &all_params](const std::vector& params) { int brickCount = mp_lib == fft_params::fft_mp_lib_none ? 
localDeviceCount : mp_ranks; for(auto& p : params) { // start with all-ones in grids std::vector input_grid(p.length.size() + 1, 1); std::vector output_grid(p.length.size() + 1, 1); auto p_dist = p; switch(type) { case SLOW_INOUT: input_grid[1] = brickCount; output_grid[1] = brickCount; break; case SLOW_IN: // this type only specifies input field and no output // field, but multi-process transforms require both // fields. if(mp_lib != fft_params::fft_mp_lib_none) continue; input_grid[1] = brickCount; break; case SLOW_OUT: // this type only specifies output field and no input // field, but multi-process transforms require both // fields. if(mp_lib != fft_params::fft_mp_lib_none) continue; output_grid[1] = brickCount; break; case SLOW_IN_FAST_OUT: // requires at least rank-2 FFT if(p.length.size() < 2) continue; input_grid[1] = brickCount; output_grid.back() = brickCount; break; case PENCIL_3D: // need at least 2 bricks per split dimension, or 4 devices. // also needs to be a 3D problem. if(brickCount < 4 || p.length.size() != 3) continue; // make fast dimension contiguous on input input_grid[1] = static_cast(sqrt(brickCount)); input_grid[2] = brickCount / input_grid[1]; // make middle dimension contiguous on output output_grid[1] = input_grid[1]; output_grid[3] = input_grid[2]; break; } p_dist.mp_lib = mp_lib; p_dist.distribute_input(localDeviceCount, input_grid); p_dist.distribute_output(localDeviceCount, output_grid); // "placement" flag is meaningless if exactly one of // input+output is a field. So just add those cases if // the flag is "out-of-place", since "in-place" is // exactly the same test case. 
if(p_dist.placement == fft_placement_inplace && p_dist.ifields.empty() != p_dist.ofields.empty()) continue; all_params.push_back(std::move(p_dist)); } }; distribute_params(params_complex); distribute_params(params_real); return all_params; } // split both input and output on slowest FFT dim INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_dim, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT)), accuracy_test::TestName); // split slowest FFT dim only on input, or only on output INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_input_dim, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_output_dim, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT)), accuracy_test::TestName); // split input on slowest FFT and output on fastest, to minimize data // movement (only makes sense for rank-2 and higher FFTs) INSTANTIATE_TEST_SUITE_P(multi_gpu_slowin_fastout, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT)), accuracy_test::TestName); // 3D pencil decompositions INSTANTIATE_TEST_SUITE_P(multi_gpu_3d_pencils, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D)), accuracy_test::TestName); TEST(multi_gpu_validate, catch_validation_errors) { const auto all_split_types = { SLOW_INOUT, SLOW_IN, SLOW_OUT, SLOW_IN_FAST_OUT, PENCIL_3D, }; for(auto type : all_split_types) { // gather all of the multi-GPU test cases auto params = param_generator_multi_gpu(type); for(size_t i = 0; i < params.size(); ++i) { auto& param = params[i]; // this validation runs in rocfft-test itself and // multi-process libs are not initialized. 
if(param.mp_lib != fft_params::fft_mp_lib_none) continue; std::vector available_fields; if(!param.ifields.empty()) available_fields.push_back(¶m.ifields.front()); if(!param.ofields.empty()) available_fields.push_back(¶m.ofields.front()); // get iterator to the brick we will modify auto field = available_fields[i % available_fields.size()]; auto brick_iter = field->bricks.begin() + i % field->bricks.size(); // iterate through the 5 cases we want to test: switch(i % 5) { case 0: { // missing brick field->bricks.erase(brick_iter); break; } case 1: { // a brick's lower index too small by one size_t& index = brick_iter->lower[i % brick_iter->lower.size()]; // don't worry about underflow since that should also // produce an invalid brick layout --index; break; } case 2: { // a brick's lower index too large by one size_t& index = brick_iter->lower[i % brick_iter->lower.size()]; ++index; break; } case 3: { // a brick's upper index too small by one size_t& index = brick_iter->upper[i % brick_iter->lower.size()]; // don't worry about underflow since that should also // produce an invalid brick layout --index; break; } case 4: { // a brick's upper index too large by one size_t& index = brick_iter->upper[i % brick_iter->lower.size()]; ++index; break; } } rocfft_params rparam{param}; // brick layout is invalid, so this should fail try { rparam.setup_structs(); } catch(std::runtime_error&) { continue; } // didn't get an exception, fail the test GTEST_FAIL() << "invalid brick layout " << rparam.token() << " should have failed, but plan was created successfully"; } } } rocFFT-rocm-6.4.3/clients/tests/multithread_test.cpp000066400000000000000000000312051501537341300225000ustar00rootroot00000000000000// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "../../shared/rocfft_against_fftw.h" #include "../../shared/rocfft_params.h" #include "rocfft/rocfft.h" #include #include #include #include #include #include // normalize results of an inverse transform, so it can be directly // compared to the original data before the forward transform __global__ void normalize_inverse_results(rocfft_complex* array, float N) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; array[idx].x /= N; array[idx].y /= N; } // Run a transform of specified dimensions, size N on each dimension. // Data is randomly generated based on the seed value, and we do a // forward + inverse transform and compare against what we started // with. 
struct Test_Transform { // real constructor sets all the data up and creates the plans Test_Transform(size_t _N, size_t _dim, uint32_t _seed) : N(_N) , dim(_dim) , seed(_seed) { // compute total data size size_t datasize = 1; for(size_t i = 0; i < dim; ++i) { datasize *= N; } size_t Nbytes = datasize * sizeof(rocfft_complex); // Create HIP device buffers if(device_mem_in.alloc(Nbytes) != hipSuccess) throw std::bad_alloc(); if(device_mem_out.alloc(Nbytes) != hipSuccess) throw std::bad_alloc(); // Initialize data std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); host_mem_in.resize(datasize); host_mem_out.resize(datasize); for(size_t i = 0; i < datasize; i++) { host_mem_in[i].x = dist(gen); host_mem_in[i].y = dist(gen); } // Copy data to device // NB: Cannot use ASSERT_EQ because constructor does not return void. EXPECT_EQ( hipMemcpy(device_mem_in.data(), host_mem_in.data(), Nbytes, hipMemcpyHostToDevice), hipSuccess); } Test_Transform(const Test_Transform&) = delete; void operator=(const Test_Transform&) = delete; Test_Transform(Test_Transform&& other) : stream(std::move(other.stream)) , work_buffer(other.work_buffer) , device_mem_in(std::move(other.device_mem_in)) , device_mem_out(std::move(other.device_mem_out)) { other.work_buffer = nullptr; host_mem_in.swap(other.host_mem_in); host_mem_out.swap(other.host_mem_out); } void run_transform() { // Create rocFFT plans (forward + inverse) std::vector lengths(dim, N); ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_notinplace, rocfft_transform_type_complex_forward, rocfft_precision_single, dim, lengths.data(), 1, nullptr), rocfft_status_success); ASSERT_EQ(rocfft_plan_create(&plan_inv, rocfft_placement_inplace, rocfft_transform_type_complex_inverse, rocfft_precision_single, dim, lengths.data(), 1, nullptr), rocfft_status_success); // allocate work buffer if necessary ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success); // NOTE: assuming that 
same-sized work buffer is ok for both // forward and inverse transforms if(work_buffer_size) { ASSERT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess); } stream.alloc(); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); ASSERT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success); // NOTE: This multithread test is intended to test the cases having work_buffer_size // If the assert fails, this means we should change the problem. // But that rarely happens (maybe when the opt_strategy is minimal_buffer) // So we don't put this one inside the if(work_buffer_size){ ... } ASSERT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer, work_buffer_size), rocfft_status_success); // Execute forward plan out-of-place void* in_ptr = device_mem_in.data(); void* out_ptr = device_mem_out.data(); ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success); // Execute inverse plan in-place ASSERT_EQ(rocfft_execute(plan_inv, &out_ptr, nullptr, info), rocfft_status_success); ASSERT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success); // Apply normalization so the values really are comparable hipLaunchKernelGGL(normalize_inverse_results, host_mem_out.size(), 1, 0, // sharedMemBytes stream, // stream static_cast*>(device_mem_out.data()), static_cast(host_mem_out.size())); ran_transform = true; } void do_cleanup() { // complain loudly if we set up for a transform but did not // actually run it if(plan && !ran_transform) ADD_FAILURE(); // wait for execution to finish if(stream) { ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess); stream.free(); } ASSERT_EQ(hipFree(work_buffer), hipSuccess); work_buffer = nullptr; ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); plan = nullptr; ASSERT_EQ(rocfft_plan_destroy(plan_inv), rocfft_status_success); plan_inv = nullptr; // Copy result back to host if(device_mem_out.data() && !host_mem_out.empty()) { 
ASSERT_EQ(hipMemcpy(host_mem_out.data(), device_mem_out.data(), host_mem_out.size() * sizeof(rocfft_complex), hipMemcpyDeviceToHost), hipSuccess); // Compare data we got to the original. // We're running 2 transforms (forward+inverse), so we // should tolerate 2x the error of a single transform. const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); auto input_norm = norm_complex(reinterpret_cast*>(host_mem_in.data()), host_mem_in.size(), 1, 1, host_mem_in.size(), {0}); auto diff = distance_1to1_complex( reinterpret_cast*>(host_mem_in.data()), reinterpret_cast*>(host_mem_out.data()), // data is all contiguous, we can treat it as 1d host_mem_in.size(), 1, 1, host_mem_in.size(), 1, host_mem_out.size(), nullptr, MAX_TRANSFORM_ERROR, {0}, {0}); EXPECT_LT(diff.l_2 / input_norm.l_2, sqrt(log2(host_mem_in.size())) * MAX_TRANSFORM_ERROR); EXPECT_LT(diff.l_inf / input_norm.l_inf, log2(host_mem_in.size()) * MAX_TRANSFORM_ERROR); // Free buffers host_mem_in.clear(); host_mem_out.clear(); } } ~Test_Transform() { do_cleanup(); } size_t N = 0; size_t dim = 0; uint32_t seed = 0; hipStream_wrapper_t stream; rocfft_plan plan = nullptr; rocfft_plan plan_inv = nullptr; size_t work_buffer_size = 0; void* work_buffer = nullptr; gpubuf device_mem_in; gpubuf device_mem_out; std::vector> host_mem_in; std::vector> host_mem_out; // ensure that we don't forget to actually run the transform bool ran_transform = false; }; // run concurrent transforms, one per thread, size N on each dimension static void multithread_transform(size_t N, size_t dim, size_t num_threads) { std::vector threads; threads.reserve(num_threads); for(size_t j = 0; j < num_threads; ++j) { threads.emplace_back([=]() { try { Test_Transform t(N, dim, j); t.run_transform(); } catch(std::bad_alloc& e) { ADD_FAILURE() << "memory allocation failure"; } }); } for(auto& t : threads) t.join(); } // for multi-stream tests, set up a bunch of streams, then execute // all of those transforms from a single thread. 
afterwards, // wait/verify/cleanup in parallel to save wall time during the test. static void multistream_transform(size_t N, size_t dim, size_t num_streams) { std::vector> transforms; transforms.resize(num_streams); std::vector threads; threads.reserve(num_streams); // get all data ready in parallel for(size_t i = 0; i < num_streams; ++i) threads.emplace_back([=, &transforms]() { try { transforms[i] = std::make_unique(N, dim, i); } catch(std::bad_alloc&) { ADD_FAILURE() << "memory allocation failure"; } }); for(auto& t : threads) t.join(); threads.clear(); // now start the actual transforms serially, but in separate // streams for(auto& t : transforms) { if(!t) // must have failed to allocate memory, abort the test return; t->run_transform(); } // clean up for(size_t i = 0; i < transforms.size(); ++i) threads.emplace_back([=, &transforms]() { transforms[i]->do_cleanup(); }); for(auto& t : threads) t.join(); } // pick arbitrary sizes here to get some parallelism while still // fitting into e.g. 8 GB of GPU memory TEST(DISABLED_rocfft_UnitTest, simple_multithread_1D) { multithread_transform(1048576, 1, 64); } TEST(DISABLED_rocfft_UnitTest, simple_multithread_2D) { multithread_transform(1024, 2, 64); } TEST(DISABLED_rocfft_UnitTest, simple_multithread_3D) { multithread_transform(128, 3, 40); } TEST(rocfft_UnitTest, simple_multistream_1D) { multistream_transform(1048576, 1, 32); } TEST(rocfft_UnitTest, simple_multistream_2D) { multistream_transform(1024, 2, 32); } TEST(rocfft_UnitTest, simple_multistream_3D) { multistream_transform(128, 3, 32); } rocFFT-rocm-6.4.3/clients/tests/random.cpp000066400000000000000000000115411501537341300204000ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_accuracy_test.h" #include "../../shared/test_params.h" class random_params : public ::testing::TestWithParam< std::tuple> { }; // TODO: Add batch and stride auto random_param_generator(const int dimension, const std::vector& precisions, const std::vector& place_range, const fft_transform_type transform_type) { std::vector params; int maxlen = 0; switch(dimension) { case 1: maxlen = 1 << 15; break; case 2: maxlen = 1 << 10; break; case 3: maxlen = 1 << 6; break; default: throw std::runtime_error("invalid dimension for random tests"); } std::mt19937 rgen(random_seed); // Mean value of the exponential distribution is maxlen: std::exponential_distribution distribution(1.0 / maxlen); std::uniform_int_distribution precision_distr(0, precisions.size() - 1); std::uniform_int_distribution place_distr(0, place_range.size() - 1); while(params.size() < n_random_tests) { const auto precision = precisions[precision_distr(rgen)]; const auto placement = place_range[place_distr(rgen)]; fft_params param; param.transform_type = transform_type; param.precision = precision; param.placement = placement; for(int idim = 0; idim < dimension; ++idim) { // NB: the distribution can return 0, so add 1 to avoid this issue. 
param.length.push_back(1 + (size_t)distribution(rgen)); } param.validate(); if(param.valid(0)) { bool found = false; for(size_t idx = 0; idx < params.size(); ++idx) { if(param.token() == params[idx].token()) { found = true; break; } } if(!found) { params.push_back(param); } } } return params; } INSTANTIATE_TEST_SUITE_P( random_complex_1d, accuracy_test, ::testing::ValuesIn(random_param_generator( 1, precision_range_sp_dp, place_range, fft_transform_type_complex_forward)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( random_complex_2d, accuracy_test, ::testing::ValuesIn(random_param_generator( 2, precision_range_sp_dp, place_range, fft_transform_type_complex_forward)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( random_complex_3d, accuracy_test, ::testing::ValuesIn(random_param_generator( 3, precision_range_sp_dp, place_range, fft_transform_type_complex_forward)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( random_real_1d, accuracy_test, ::testing::ValuesIn(random_param_generator( 1, precision_range_sp_dp, place_range, fft_transform_type_real_forward)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( random_real_2d, accuracy_test, ::testing::ValuesIn(random_param_generator( 2, precision_range_sp_dp, place_range, fft_transform_type_real_forward)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( random_real_3d, accuracy_test, ::testing::ValuesIn(random_param_generator( 3, precision_range_sp_dp, place_range, fft_transform_type_real_forward)), accuracy_test::TestName); rocFFT-rocm-6.4.3/clients/tests/rocfft_accuracy_test.cpp000066400000000000000000000107571501537341300233240ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include #include #include #include "../../shared/rocfft_accuracy_test.h" #include "../../shared/client_except.h" #include "../../shared/fftw_transform.h" #include "../../shared/gpubuf.h" #include "../../shared/rocfft_against_fftw.h" #include "../../shared/subprocess.h" #include "rocfft/rocfft.h" extern std::string mp_launch; extern last_cpu_fft_cache last_cpu_fft_data; void fft_vs_reference(rocfft_params& params, bool round_trip) { switch(params.precision) { case fft_precision_half: fft_vs_reference_impl(params, round_trip); break; case fft_precision_single: fft_vs_reference_impl(params, round_trip); break; case fft_precision_double: fft_vs_reference_impl(params, round_trip); break; } } // Test for comparison between FFTW and rocFFT. TEST_P(accuracy_test, vs_fftw) { rocfft_params params(GetParam()); params.validate(); // Test that the tokenization works as expected. 
auto testcase_token = params.token(); fft_params tokentest; tokentest.from_token(testcase_token); auto testcase_token1 = tokentest.token(); EXPECT_EQ(testcase_token, testcase_token1); if(!params.valid(verbose)) { GTEST_FAIL() << "Invalid parameters"; } switch(params.mp_lib) { case fft_params::fft_mp_lib_none: { // Single-proc FFT. // Only do round trip for non-field FFTs bool round_trip = params.ifields.empty() && params.ofields.empty(); try { fft_vs_reference(params, round_trip); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { // explicitly clear cache last_cpu_fft_data = last_cpu_fft_cache(); GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } break; } case fft_params::fft_mp_lib_mpi: { // Multi-proc FFT. // Split launcher into tokens since the first one is the exe // and the remainder is the start of its argv boost::escaped_list_separator sep('\\', ' ', '\"'); boost::tokenizer> tokenizer(mp_launch, sep); std::string exe; std::vector argv; for(auto t : tokenizer) { if(t.empty()) continue; if(exe.empty()) exe = t; else argv.push_back(t); } // append test token and ask for accuracy test argv.push_back("--token"); argv.push_back(testcase_token); argv.push_back("--accuracy"); // throws an exception if launch fails or if subprocess // returns nonzero exit code execute_subprocess(exe, argv, {}); break; } default: GTEST_FAIL() << "Invalid communicator choice!"; break; } SUCCEED(); } rocFFT-rocm-6.4.3/clients/tests/rocfft_mpi_worker.cpp000066400000000000000000000040001501537341300226310ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "../../shared/mpi_worker.h" #include "../../shared/rocfft_params.h" int main(int argc, char* argv[]) { #ifdef ROCFFT_DYNA_MPI_WORKER return mpi_worker_main, true>( "dynamic rocFFT MPI worker process", argc, argv, [](const std::vector& lib_strings) { std::vector all_params; for(auto& lib : lib_strings) all_params.emplace_back(lib); return all_params; }); #else return mpi_worker_main, false>( "rocFFT MPI worker process", argc, argv, [](const std::vector&) { return std::array(); }); #endif } rocFFT-rocm-6.4.3/clients/tests/rtc_helper_crash.cpp000066400000000000000000000025471501537341300224350ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // just crash int main() { char* a = 0; // NOTE: this is supposed to crash, since it's used in a test // that checks crashing child processes. // // cppcheck-suppress nullPointer *a = 0; return 0; } rocFFT-rocm-6.4.3/clients/tests/unit_test.cpp000066400000000000000000000654521501537341300211500ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "rocfft/rocfft.h" #include "../../shared/concurrency.h" #include "../../shared/environment.h" #include "../../shared/gpubuf.h" #include "../../shared/rocfft_complex.h" #include "hip/hip_runtime_api.h" #include #include #include #include #include #include #include #include #include #ifdef _OPENMP #include #endif #if __has_include() #include #else #include namespace std { namespace filesystem = experimental::filesystem; } #endif namespace fs = std::filesystem; #ifndef WIN32 // get program_invocation_name #include #endif TEST(rocfft_UnitTest, plan_description) { rocfft_plan_description desc = nullptr; ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_create(&desc)); rocfft_array_type in_array_type = rocfft_array_type_complex_interleaved; rocfft_array_type out_array_type = rocfft_array_type_complex_interleaved; size_t rank = 1; size_t i_strides[3] = {1, 1, 1}; size_t o_strides[3] = {1, 1, 1}; size_t idist = 0; size_t odist = 0; rocfft_plan plan = NULL; size_t length = 8; ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_set_data_layout(desc, in_array_type, out_array_type, 0, 0, rank, i_strides, idist, rank, o_strides, odist)); ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, rank, &length, 1, desc)); ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_destroy(desc)); ASSERT_TRUE(rocfft_status_success == rocfft_plan_destroy(plan)); } TEST(rocfft_UnitTest, plan_description_reuse) { // check that a plan description can be reused between different // plans, with different layout parameters for each. // allocate plan description once rocfft_plan_description desc = nullptr; ASSERT_EQ(rocfft_plan_description_create(&desc), rocfft_status_success); std::vector> output; // do length-8 FFTs with different strides. 
first one is // stride-1 and we use that as our baseline to know what output // to expect for the rest const size_t length = 8; for(const size_t stride : {1, 2, 4}) { // set layout for this stride ASSERT_EQ(rocfft_plan_description_set_data_layout(desc, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, 1, &stride, length * stride, 1, &stride, length * stride), rocfft_status_success); static const rocfft_complex input[8]{{-0.100, 0.380}, {0.0166, 0.439}, {-0.475, 0.212}, {0.440, -0.432}, {0.445, 0.0589}, {0.296, 0.164}, {-0.084, 0.077}, {0.320, 0.087}}; // allocate host buffer. initialize the whole thing to zero // but set a known input along the strides we want std::vector> data_host(length * stride, {0.0, 0.0}); for(size_t i = 0; i < length; ++i) { data_host[i * stride] = input[i]; } // copy to device const size_t data_bytes = data_host.size() * sizeof(rocfft_complex); gpubuf_t> data_dev; ASSERT_EQ(data_dev.alloc(data_bytes), hipSuccess); void* data_dev_ptr = data_dev.data(); ASSERT_EQ(hipMemcpy(data_dev_ptr, data_host.data(), data_bytes, hipMemcpyHostToDevice), hipSuccess); // do the transform rocfft_plan plan = nullptr; ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, desc), rocfft_status_success); ASSERT_EQ(rocfft_execute(plan, &data_dev_ptr, nullptr, nullptr), rocfft_status_success); ASSERT_EQ(hipMemcpy(data_host.data(), data_dev_ptr, data_bytes, hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQ(hipDeviceSynchronize(), hipSuccess); // save output for reference on first run if(output.empty()) { output = data_host; } else { // check that the output matches output from the first // (stride-1) run. 
for(size_t i = 0; i < length; ++i) ASSERT_EQ(data_host[i * stride], output[i]); } ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); } ASSERT_EQ(rocfft_plan_description_destroy(desc), rocfft_status_success); } // Check whether logs can be emitted from multiple threads properly TEST(rocfft_UnitTest, log_multithreading) { static const int NUM_THREADS = 10; static const int NUM_ITERS_PER_THREAD = 50; static const char* TRACE_FILE = "trace.log"; // clean up environment and temporary file when we exit BOOST_SCOPE_EXIT_ALL(=) { rocfft_cleanup(); remove(TRACE_FILE); // re-init logs with default logging rocfft_setup(); }; // ask for trace logging, since that's the easiest to trigger rocfft_cleanup(); EnvironmentSetTemp layer("ROCFFT_LAYER", "1"); EnvironmentSetTemp tracepath("ROCFFT_LOG_TRACE_PATH", TRACE_FILE); rocfft_setup(); // run a whole bunch of threads in parallel, each one doing // something small that will write to the trace log std::vector threads; threads.reserve(NUM_THREADS); for(int i = 0; i < NUM_THREADS; ++i) { threads.emplace_back([]() { for(int j = 0; j < NUM_ITERS_PER_THREAD; ++j) { rocfft_plan_description desc; rocfft_plan_description_create(&desc); rocfft_plan_description_destroy(desc); } }); } for(auto& t : threads) { t.join(); } rocfft_cleanup(); // now verify that the trace log has one message per line, with nothing garbled std::ifstream trace_log(TRACE_FILE); std::string line; std::regex validator("^rocfft_(setup|cleanup|plan_description_(create|destroy)," "description,[x0-9a-fA-F]+)$"); while(std::getline(trace_log, line)) { bool res = std::regex_match(line, validator); ASSERT_TRUE(res) << "line contains invalid content: " << line; } } // a function that accepts a plan's requested size on input, and // returns the size to actually allocate for the test typedef std::function workmem_sizer; void workmem_test(workmem_sizer sizer, rocfft_status exec_status_expected, bool give_null_work_buf = false) { // Prime size requires Bluestein, which 
guarantees work memory. size_t length = 8191; rocfft_plan plan = NULL; ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, nullptr), rocfft_status_success); size_t requested_work_size = 0; ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &requested_work_size), rocfft_status_success); ASSERT_GT(requested_work_size, 0U); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); size_t alloc_work_size = sizer(requested_work_size); gpubuf work_buffer; if(alloc_work_size) { ASSERT_EQ(work_buffer.alloc(alloc_work_size), hipSuccess); void* work_buffer_ptr; rocfft_status set_work_expected_status; if(give_null_work_buf) { work_buffer_ptr = nullptr; set_work_expected_status = rocfft_status_invalid_work_buffer; } else { work_buffer_ptr = work_buffer.data(); set_work_expected_status = rocfft_status_success; } ASSERT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer_ptr, alloc_work_size), set_work_expected_status); } // allocate 2x length for complex std::vector data_host(length * 2, 1.0f); gpubuf data_device; auto data_size_bytes = data_host.size() * sizeof(float); ASSERT_EQ(data_device.alloc(data_size_bytes), hipSuccess); ASSERT_EQ( hipMemcpy(data_device.data(), data_host.data(), data_size_bytes, hipMemcpyHostToDevice), hipSuccess); std::vector ibuffers(1, static_cast(data_device.data())); ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, info), exec_status_expected); rocfft_execution_info_destroy(info); rocfft_plan_destroy(plan); } // check what happens if work memory is required but is not provided // - library should allocate TEST(rocfft_UnitTest, workmem_missing) { workmem_test([](size_t) { return 0; }, rocfft_status_success); } // check what happens if work memory is required but not enough is provided TEST(rocfft_UnitTest, workmem_small) { workmem_test([](size_t requested) { return requested / 2; }, 
rocfft_status_invalid_work_buffer); } // hard to imagine this being a problem, but try giving too much as well TEST(rocfft_UnitTest, workmem_big) { workmem_test([](size_t requested) { return requested * 2; }, rocfft_status_success); } // check if a user explicitly gives a null pointer - set work buffer // should fail, but transform should succeed because library // allocates TEST(rocfft_UnitTest, workmem_null) { workmem_test([](size_t requested) { return requested; }, rocfft_status_success, true); } static const size_t RTC_PROBLEM_SIZE = 2304; // runtime compilation cache tests TEST(rocfft_UnitTest, rtc_cache) { // PRECONDITIONS // - set cache location to custom path, requires uninitializing // the lib and reinitializing with some env vars // - also enable RTC logging so we can tell when something was // actually compiled const std::string rtc_cache_path = std::tmpnam(nullptr); const std::string rtc_log_path = std::tmpnam(nullptr); void* empty_cache = nullptr; size_t empty_cache_bytes = 0; void* onekernel_cache = nullptr; size_t onekernel_cache_bytes = 0; // cleanup BOOST_SCOPE_EXIT_ALL(=) { // close log file handles rocfft_cleanup(); remove(rtc_cache_path.c_str()); remove(rtc_log_path.c_str()); // re-init lib now that the env vars are gone rocfft_setup(); if(empty_cache) rocfft_cache_buffer_free(empty_cache); if(onekernel_cache) rocfft_cache_buffer_free(onekernel_cache); }; rocfft_cleanup(); EnvironmentSetTemp cache_env("ROCFFT_RTC_CACHE_PATH", rtc_cache_path.c_str()); EnvironmentSetTemp layer_env("ROCFFT_LAYER", "32"); EnvironmentSetTemp log_env("ROCFFT_LOG_RTC_PATH", rtc_log_path.c_str()); rocfft_setup(); // - serialize empty cache as baseline ASSERT_EQ(rocfft_cache_serialize(&empty_cache, &empty_cache_bytes), rocfft_status_success); // END PRECONDITIONS // pick a length that's runtime compiled auto build_plan = [&]() { rocfft_plan plan = nullptr; ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, 
rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &RTC_PROBLEM_SIZE, 1, nullptr)); // we don't need to actually execute the plan, so we can // destroy it right away. this ensures that we don't hold on // to a plan after we cleanup the library rocfft_plan_destroy(plan); plan = nullptr; }; // check the RTC log to see if an FFT kernel got compiled auto fft_kernel_was_compiled = [&]() { // HACK: logging is done in a worker thread, so sleep for a // bit to give it a chance to actually write. It at least // should flush after writing. std::this_thread::sleep_for(std::chrono::milliseconds(100)); // look for a ROCFFT_RTC_BEGIN line that indicates RTC happened std::ifstream logfile(rtc_log_path); std::string line; while(std::getline(logfile, line)) { if(line.find("ROCFFT_RTC_BEGIN") != std::string::npos && line.find("fft_") != std::string::npos) return true; } return false; }; // build a plan that requires runtime compilation, // close logs and ensure a kernel was built build_plan(); ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); // serialized cache should be bigger than empty cache ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes); // blow away the cache, reinit the library, // retry building the plan again and ensure the kernel was rebuilt remove(rtc_cache_path.c_str()); rocfft_setup(); build_plan(); rocfft_cache_buffer_free(onekernel_cache); onekernel_cache = nullptr; ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes); // re-init library without blowing away cache. rebuild plan and // check that the kernel was not recompiled. rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); // blow away cache again, deserialize one-kernel cache. 
re-init // library and rebuild plan - kernel should again not be // recompiled remove(rtc_cache_path.c_str()); rocfft_setup(); ASSERT_EQ(rocfft_cache_deserialize(onekernel_cache, onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); // use the cache as a system cache and make the user one an empty // in-memory cache. kernel should still not be recompiled. EnvironmentSetTemp cache_sys_env("ROCFFT_RTC_SYS_CACHE_PATH", rtc_cache_path.c_str()); EnvironmentSetTemp cache_empty_env("ROCFFT_RTC_CACHE_PATH", ":memory:"); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); // check that the system cache is not written to, even if it's // writable by the current user. after removing the cache, the // kernel should always be recompiled since rocFFT has no durable // place to write it to. remove(rtc_cache_path.c_str()); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); } // make sure cache API functions tolerate null pointers without crashing TEST(rocfft_UnitTest, rtc_cache_null) { void* buf = nullptr; size_t buf_len = 0; ASSERT_EQ(rocfft_cache_serialize(nullptr, &buf_len), rocfft_status_invalid_arg_value); ASSERT_EQ(rocfft_cache_serialize(&buf, nullptr), rocfft_status_invalid_arg_value); ASSERT_EQ(rocfft_cache_buffer_free(nullptr), rocfft_status_success); ASSERT_EQ(rocfft_cache_deserialize(nullptr, 12345), rocfft_status_invalid_arg_value); ASSERT_EQ(rocfft_cache_deserialize(&buf_len, 0), rocfft_status_invalid_arg_value); } // make sure RTC gracefully handles a helper process that crashes TEST(rocfft_UnitTest, rtc_helper_crash) { #ifdef WIN32 char filename[MAX_PATH]; GetModuleFileNameA(NULL, filename, MAX_PATH); fs::path test_exe = filename; fs::path crasher_exe = 
test_exe.replace_filename("rtc_helper_crash.exe"); #else fs::path test_exe = program_invocation_name; fs::path crasher_exe = test_exe.replace_filename("rtc_helper_crash"); #endif // use the crashing helper EnvironmentSetTemp env_helper("ROCFFT_RTC_PROCESS_HELPER", crasher_exe.string().c_str()); // don't touch the cache, to force compilation EnvironmentSetTemp env_read("ROCFFT_RTC_CACHE_READ_DISABLE", "1"); EnvironmentSetTemp env_write("ROCFFT_RTC_CACHE_WRITE_DISABLE", "1"); // force out-of-process compile EnvironmentSetTemp env_process("ROCFFT_RTC_PROCESS", "2"); rocfft_plan plan = nullptr; ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &RTC_PROBLEM_SIZE, 1, nullptr)); // alloc a complex buffer gpubuf_t> data; ASSERT_EQ(data.alloc(RTC_PROBLEM_SIZE * sizeof(rocfft_complex)), hipSuccess); std::vector ibuffers(1, static_cast(data.data())); ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, nullptr), rocfft_status_success); rocfft_plan_destroy(plan); plan = nullptr; rocfft_cleanup(); rocfft_setup(); // also try with forcing use of the subprocess, which is a // different code path from the default "try in-process, then // fall back to out-of-process" EnvironmentSetTemp env_force("ROCFFT_RTC_PROCESS", "1"); ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &RTC_PROBLEM_SIZE, 1, nullptr)); ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, nullptr), rocfft_status_success); rocfft_plan_destroy(plan); plan = nullptr; } TEST(rocfft_UnitTest, rtc_test_harness) { // check that hipcc is available since this test requires it // // NOTE: using system() for launching subprocesses for simplicity // and portability #ifdef WIN32 static const char* test_command = "amdclang++ --version > NUL"; #else static const char* test_command = "amdclang++ --version > 
/dev/null"; #endif if(std::system(test_command) != 0) GTEST_SKIP(); rocfft_cleanup(); BOOST_SCOPE_EXIT_ALL() { // reinit rocFFT so caching goes back to normal rocfft_cleanup(); rocfft_setup(); }; // extra scope to control lifetime of env vars { // rtc test harness writes to system's temp directory auto tmp_path = fs::temp_directory_path(); // activate writing of rtc test harnesses EnvironmentSetTemp env_harness("ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS", "1"); // set path for writing rtc test harnesses source files EnvironmentSetTemp env_harness_path("ROCFFT_DEBUG_KERNEL_HARNESS_PATH", tmp_path.string().c_str()); // ensure every kernel gets compiled once EnvironmentSetTemp env_cache("ROCFFT_RTC_CACHE_PATH", ":memory:"); EnvironmentSetTemp env_sys_cache("ROCFFT_RTC_SYS_CACHE_PATH", ":memory:"); rocfft_setup(); // ensure stale files from previous runs of this test won't cause // problems - clean up any rocfft_kernel_harness_*.cpp files that // might be left behind for(const auto& entry : std::filesystem::directory_iterator{tmp_path}) { auto filename = entry.path().filename(); if(filename.string().compare(0, 22, "rocfft_kernel_harness_") == 0 && filename.extension().string() == ".cpp") fs::remove(entry); } // construct a few different types of plans to try to get all // different kernels compiled auto create_destroy_plan = [](rocfft_transform_type type, const size_t dim, const size_t* lengths) -> void { rocfft_plan plan = nullptr; ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_inplace, type, rocfft_precision_single, dim, lengths, 1, nullptr), rocfft_status_success); ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); plan = nullptr; }; // large 1D R2C + C2R const size_t L1D_PROBLEM_SIZE[1] = {16384}; create_destroy_plan(rocfft_transform_type_real_forward, 1, L1D_PROBLEM_SIZE); create_destroy_plan(rocfft_transform_type_real_inverse, 1, L1D_PROBLEM_SIZE); // small bluestein R2C + C2R (also covers odd length) const size_t SMALL_BLUESTEIN_PROBLEM_SIZE[1] = 
{37}; create_destroy_plan(rocfft_transform_type_real_forward, 1, SMALL_BLUESTEIN_PROBLEM_SIZE); create_destroy_plan(rocfft_transform_type_real_inverse, 1, SMALL_BLUESTEIN_PROBLEM_SIZE); // large bluestein C2C const size_t LARGE_BLUESTEIN_PROBLEM_SIZE[1] = {8191}; create_destroy_plan(rocfft_transform_type_complex_forward, 1, LARGE_BLUESTEIN_PROBLEM_SIZE); // L1D_TRTRT const size_t L1D_TRTRT_PROBLEM_SIZE[1] = {680}; create_destroy_plan(rocfft_transform_type_complex_forward, 1, L1D_TRTRT_PROBLEM_SIZE); // small 3D (exercises 2D_SINGLE) const size_t SMALL_3D_PROBLEM_SIZE[3] = {25, 25, 25}; create_destroy_plan(rocfft_transform_type_complex_forward, 3, SMALL_3D_PROBLEM_SIZE); // larger 3D const size_t LARGE_3D_PROBLEM_SIZE[3] = {200, 200, 200}; create_destroy_plan(rocfft_transform_type_complex_forward, 3, LARGE_3D_PROBLEM_SIZE); // now try to compile each file - they'd need hand-editing to test // something useful, but we can at least ensure they build. // enumerate all the files std::vector> files; size_t i = 0; for(;; ++i) { // construct name of main file fs::path main_file = tmp_path / ("rocfft_kernel_harness_" + std::to_string(i) + ".cpp"); if(!fs::exists(main_file)) break; files.emplace_back(main_file.string(), -1); } // we should have generated at least a few kernels ASSERT_FALSE(files.empty()); #ifdef _OPENMP #pragma omp parallel for num_threads(rocfft_concurrency()) #endif for(i = 0; i < files.size(); ++i) { #ifdef WIN32 const std::string command = "amdclang++ -x hip -c -std=c++17 -o NUL " + files[i].first; #else const std::string command = "amdclang++ -x hip -c -std=c++17 -o /dev/null " + files[i].first; #endif files[i].second = std::system(command.c_str()); } // check that all compiles succeeded for(const auto& file : files) ASSERT_EQ(file.second, 0); } } rocFFT-rocm-6.4.3/clients/tests/validate_length_stride.cpp000066400000000000000000000075601501537341300236320ustar00rootroot00000000000000// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/array_validator.h" #include #include #include inline auto generate_valid_length_stride() { // Array of tuples of length, stride. 
std::vector, std::vector>> vals = { {{8}, {1}}, {{8, 2}, {1, 0}}, {{8, 8}, {8, 1}}, {{8, 8, 8}, {64, 8, 1}}, {{8, 8, 8}, {64, 7, 1}}, {{8, 8, 8, 8}, {512, 64, 7, 1}}, {{8, 8, 8, 8}, {512, 64, 8, 1}}, {{8, 8, 8, 8, 8}, {4096, 512, 64, 8, 1}}, {{8, 8, 8, 8, 8}, {4096, 512, 64, 7, 1}}, {{8, 8, 8, 8, 8, 8}, {32768, 4096, 512, 64, 8, 1}}, {{299, 307, 495}, {1006, 50, 674}}, }; return vals; } class valid_length_stride : public ::testing::TestWithParam, std::vector>> { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; auto direct_validity_test(const std::vector& length, const std::vector& stride, const int verbose) { std::unordered_set vals{}; std::vector index(length.size()); std::fill(index.begin(), index.end(), 0); do { const int i = std::inner_product(index.begin(), index.end(), stride.begin(), (size_t)0); if(vals.find(i) == vals.end()) { vals.insert(i); } else { return false; } } while(increment_rowmajor(index, length)); return true; } TEST_P(valid_length_stride, direct_comparison) { const std::vector length = std::get<0>(GetParam()); const std::vector stride = std::get<1>(GetParam()); if(verbose) { std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; std::cout << "stride:"; for(const auto i : stride) std::cout << " " << i; std::cout << "\n"; } auto test_val = array_valid(length, stride, verbose); if(verbose) { std::cout << "test value is: " << (test_val ? "valid" : "invalid") << "\n"; } auto ref_val = direct_validity_test(length, stride, verbose); if(verbose) { std::cout << "reference value is: " << (ref_val ? 
"valid" : "invalid") << "\n"; } EXPECT_EQ(test_val, ref_val); SUCCEED(); } INSTANTIATE_TEST_SUITE_P(reference_test, valid_length_stride, ::testing::ValuesIn(generate_valid_length_stride())); rocFFT-rocm-6.4.3/cmake/000077500000000000000000000000001501537341300146675ustar00rootroot00000000000000rocFFT-rocm-6.4.3/cmake/get-cli-arguments.cmake000066400000000000000000000041601501537341300212210ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# Attempt (best effort) to return a list of user specified parameters cmake was invoked with
# NOTE: Even if the user specifies CMAKE_INSTALL_PREFIX on the command line, the parameter is
# not returned because it does not have the matching helpstring
#
# append_cmake_cli_arguments( <initial_cli_args> <return_cli_args> )
#
#   initial_cli_args - NAME of a variable in the calling scope holding arguments
#                      that should come first in the result (may be empty/unset)
#   return_cli_args  - NAME of the variable to set in the calling scope; it receives
#                      the initial arguments followed by one "-D<name>=<value>"
#                      entry per cache variable given on the command line
#
# Example:
#   set( base_args -DCMAKE_BUILD_TYPE=Release )
#   append_cmake_cli_arguments( base_args all_args )
function( append_cmake_cli_arguments initial_cli_args return_cli_args )
  # Take a snapshot of the caller's list before any local is created.  Starting
  # from this explicit value (instead of an uninitialised accumulator) keeps a
  # same-named variable in the calling scope from leaking into the result.
  set( acli_collected ${${initial_cli_args}} )

  # Retrieves the names of all entries in CMakeCache.txt
  get_cmake_property( acli_cache_names CACHE_VARIABLES )

  foreach( acli_name IN LISTS acli_cache_names )
    get_property( acli_help CACHE "${acli_name}" PROPERTY HELPSTRING )

    # Properties specified on the command line have boilerplate text
    if( acli_help MATCHES "variable specified on the command line" )
      # Read the value from the cache entry itself; a plain ${...} lookup could be
      # answered by a normal variable that shadows the cache entry.
      get_property( acli_value CACHE "${acli_name}" PROPERTY VALUE )
      list( APPEND acli_collected "-D${acli_name}=${acli_value}" )
    endif( )
  endforeach( )

  # Left unquoted on purpose: an empty result leaves the output variable unset,
  # exactly as before.
  set( ${return_cli_args} ${acli_collected} PARENT_SCOPE )
endfunction( )
rocFFT-rocm-6.4.3/cmake/package-functions.cmake000066400000000000000000000040651501537341300212770ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.

# ########################################################################
# A helper function to generate packaging scripts to register libraries with system
#
# write_rocm_package_script_files( <scripts_write_dir> <library_name> <library_link_name> )
#
#   scripts_write_dir - directory that receives the two generated files,
#                       "postinst" and "prerm"
#   library_name      - used to build the ld.so configuration file name
#                       /etc/ld.so.conf.d/<library_name>-dev.conf
#   library_link_name - accepted but not referenced anywhere in the body
#
# CPACK_PACKAGING_INSTALL_PREFIX and LIB_INSTALL_DIR are read from the calling
# scope and expanded into the script text when this function runs; only the
# escaped \$1 survives as a shell variable in the generated files.
# ########################################################################
function( write_rocm_package_script_files scripts_write_dir library_name library_link_name )

set( ld_conf_file "/etc/ld.so.conf.d/${library_name}-dev.conf" )

# postinst: on "configure", write the library directory into the ld.so
# configuration file and refresh the linker cache; echo the action for the
# abort-* cases; do nothing for anything else.
file( WRITE ${scripts_write_dir}/postinst
"#!/bin/bash

set -e

do_ldconfig() {
  echo ${CPACK_PACKAGING_INSTALL_PREFIX}/${LIB_INSTALL_DIR} > ${ld_conf_file} && ldconfig
}

case \"\$1\" in
  configure)
    do_ldconfig
  ;;
  abort-upgrade|abort-remove|abort-deconfigure)
    echo \"\$1\"
  ;;
  *)
    exit 0
  ;;
esac
" )

# prerm: on "remove" or "purge", delete the ld.so configuration file written by
# postinst and refresh the linker cache; do nothing for anything else.
file( WRITE ${scripts_write_dir}/prerm
"#!/bin/bash

set -e

rm_ldconfig() {
  rm -f ${ld_conf_file} && ldconfig
}

case \"\$1\" in
  remove|purge)
    rm_ldconfig
  ;;
  *)
    exit 0
  ;;
esac
" )

endfunction( )
rocFFT-rocm-6.4.3/cmake/sqlite.cmake000066400000000000000000000057531501537341300172040ustar00rootroot00000000000000# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software.
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.

# Selects the SQLite library rocFFT links against and publishes it as
# ROCFFT_SQLITE_LIB: either the system package (SQLITE_USE_SYSTEM_PACKAGE=ON)
# or an embedded copy built from the downloaded amalgamation (default).

include( ExternalProject )

# SQLite 3.36.0 enabled the backup API by default, which we need
# for cache serialization.  We also want to use a static SQLite,
# and distro static libraries aren't typically built
# position-independent.
option( SQLITE_USE_SYSTEM_PACKAGE "Use SQLite3 from find_package" OFF )

if( SQLITE_USE_SYSTEM_PACKAGE )
  find_package( SQLite3 3.36 REQUIRED )
  list( APPEND static_depends PACKAGE SQLite3 )
  set( ROCFFT_SQLITE_LIB SQLite::SQLite3 )
else()
  include( FetchContent )

  # The download location can be overridden through the environment.  A variable
  # that is exported but empty is treated like an unset one; otherwise the
  # cache entry below would be initialised with an empty URL.
  if( DEFINED ENV{SQLITE_3_43_2_SRC_URL} AND NOT "$ENV{SQLITE_3_43_2_SRC_URL}" STREQUAL "" )
    set( SQLITE_3_43_2_SRC_URL_INIT "$ENV{SQLITE_3_43_2_SRC_URL}" )
  else()
    set( SQLITE_3_43_2_SRC_URL_INIT https://www.sqlite.org/2023/sqlite-amalgamation-3430200.zip )
  endif()
  set( SQLITE_3_43_2_SRC_URL "${SQLITE_3_43_2_SRC_URL_INIT}"
       CACHE STRING "Location of SQLite source code" )
  set( SQLITE_SRC_3_43_2_SHA3_256 af02b88cc922e7506c6659737560c0756deee24e4e7741d4b315af341edd8b40
       CACHE STRING "SHA3-256 hash of SQLite source code" )

  # embed SQLite
  if( CMAKE_VERSION VERSION_GREATER_EQUAL 3.24 )
    # use extract timestamp for fetched files instead of timestamps in the archive
    cmake_policy( SET CMP0135 NEW )
  endif()

  # URL is left unquoted so a list of mirror URLs keeps working.
  FetchContent_Declare( sqlite_local
    URL ${SQLITE_3_43_2_SRC_URL}
    URL_HASH SHA3_256=${SQLITE_SRC_3_43_2_SHA3_256}
  )
  FetchContent_MakeAvailable( sqlite_local )

  # Only define the object library when no target of that name exists yet.
  if( NOT TARGET sqlite3 )
    add_library( sqlite3 OBJECT "${sqlite_local_SOURCE_DIR}/sqlite3.c" )
    target_include_directories( sqlite3 PUBLIC "${sqlite_local_SOURCE_DIR}" )
    set_target_properties( sqlite3 PROPERTIES
      C_VISIBILITY_PRESET "hidden"
      VISIBILITY_INLINES_HIDDEN ON
      POSITION_INDEPENDENT_CODE ON
    )
  endif()

  # we don't need extensions, and omitting them from SQLite removes the
  # need for dlopen/dlclose from within rocFFT
  target_compile_definitions( sqlite3 PRIVATE SQLITE_OMIT_LOAD_EXTENSION )

  set( ROCFFT_SQLITE_LIB sqlite3 )
endif()
rocFFT-rocm-6.4.3/cmake/std-filesystem.cmake000066400000000000000000000036521501537341300206530ustar00rootroot00000000000000# Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.
include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

# Probe source: compiles only when <filesystem> is available and
# std::filesystem::path is usable.
set(HAVE_STD_FILESYSTEM_TEST [[
#include <filesystem>

int main()
{
  std::filesystem::path p{"/"};
  return 0;
}
]])

# Run the probe with -std=c++17, but keep that flag scoped to this one check:
# CMAKE_REQUIRED_FLAGS is restored afterwards so later configure checks are not
# silently compiled with it.  The result is cached in HAVE_STD_FILESYSTEM.
cmake_push_check_state()
set(CMAKE_REQUIRED_FLAGS -std=c++17)
check_cxx_source_compiles("${HAVE_STD_FILESYSTEM_TEST}" HAVE_STD_FILESYSTEM)
cmake_pop_check_state()

if(NOT HAVE_STD_FILESYSTEM)
  message(STATUS "std::filesystem include not present, will use std::experimental::filesystem")
endif()

# Link to the experimental filesystem library if it's not available
# in the standard library.  Experimental filesystem library is not
# ABI-compatible with later libstdc++ so link that statically too.
#
# target_link_std_experimental_filesystem(<target>)
#
#   target - existing target that receives the extra PRIVATE link options.
#            Nothing is added when HAVE_STD_FILESYSTEM is true.
function(target_link_std_experimental_filesystem target)
  if(HAVE_STD_FILESYSTEM)
    return()
  endif()

  # "SHELL:" passes the whole group of flags through as written.
  target_link_options(${target} PRIVATE
    "SHELL:-lstdc++fs -static-libstdc++ -Xlinker --exclude-libs=ALL")
endfunction()
rocFFT-rocm-6.4.3/custom.properties000066400000000000000000000001351501537341300172360ustar00rootroot00000000000000booktitle=rocFFT API Guide spreadsheet.xml=docs/classification-map.xml document.locale=enusrocFFT-rocm-6.4.3/deps/000077500000000000000000000000001501537341300145425ustar00rootroot00000000000000rocFFT-rocm-6.4.3/deps/CMakeLists.txt000066400000000000000000000074671501537341300173200ustar00rootroot00000000000000# Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Helper cmake script to automate building dependencies for rocfft
# This script can be invoked manually by the user with 'cmake -P'

# The ROCm platform requires Ubuntu 16.04 or Fedora 24, which has cmake 3.5
cmake_minimum_required( VERSION 3.5 )

# Make the helper modules next to this file and in ../cmake visible to include()
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../cmake )

# Consider removing this in the future
# It can be annoying for visual studio developers to build a project that tries to install into 'program files'
# CMAKE_BINARY_DIR is used here because this code runs before project(), and
# PROJECT_BINARY_DIR is only set by the project() command.
if( WIN32 AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# The superbuild does not build anything itself; all compiling is done in external projects
project( rocfft-dependencies NONE )

option( BUILD_BOOST "Download and build boost library" ON )

# This module scrapes the CMakeCache.txt file and attempts to get all the cli options the user specified to cmake invocation
include( get-cli-arguments )

# The following is a series of super-build projects; this cmake project will download and build
if( BUILD_BOOST )
  set(ext.BUILD_BOOST "static")
  include( external-boost )

  list( APPEND rocfft_dependencies boost )

  # Extra command for the 'install' target: change into the boost build tree
  # and run the b2 command line with the 'install' action.  $<SEMICOLON> emits
  # a literal ';' at generate time so that 'cd' and the b2 invocation stay in
  # one shell command line instead of being split into separate list items.
  set( boost_custom_target COMMAND cd ${BOOST_BINARY_ROOT}$<SEMICOLON> ${Boost.Command} install )
endif( )

# POLICY CMP0037 - "Target names should not be reserved and should match a validity pattern"
# Familiar target names like 'install' should be OK at the super-build level
# NOTE(review): recent CMake releases have dropped the OLD behaviour of early
# policies; confirm this still configures with the CMake versions in use.
if( POLICY CMP0037 )
  cmake_policy( SET CMP0037 OLD )
endif( )

# 'install' runs the per-dependency install commands collected above after the
# external projects listed in rocfft_dependencies have been built.
# gtest_custom_target and lapack_custom_target are not set in this file and
# expand to nothing unless an included module defines them.
add_custom_target( install
  ${boost_custom_target}
  ${gtest_custom_target}
  ${lapack_custom_target}
  DEPENDS ${rocfft_dependencies}
)
rocFFT-rocm-6.4.3/deps/external-boost.cmake000066400000000000000000000166071501537341300205240ustar00rootroot00000000000000# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Adds Boost as an ExternalProject named 'boost'.
#
# Inputs (all optional, cache variables unless noted):
#   PREFIX_BOOST       - install prefix handed to b2 (defaults to CMAKE_INSTALL_PREFIX)
#   ext.Boost_VERSION  - Boost version to download
#   ext.Boost_LINK     - static | shared | static,shared
#   ext.Boost_LAYOUT   - versioned | tagged | system
#   ext.Boost_VARIANT  - debug | release | debug,release
#   ENV{BOOST_URL}     - alternative download location / local archive
#
# Outputs (normal variables):
#   Boost.Command      - the assembled b2 command line (without an action)
#   BOOST_INSTALL_ROOT - ExternalProject install_dir of the 'boost' project
#   BOOST_BINARY_ROOT  - ExternalProject binary_dir of the 'boost' project

message( STATUS "Configuring boost external dependency" )
include( ExternalProject )

set( PREFIX_BOOST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where boost should install, defaults to /usr/local" )

# We need to detect the compiler the user is attempting to invoke with CMake,
# we do our best to translate cmake parameters into bjam parameters
enable_language( CXX )
include( build-bitness )

# TODO:  Options should be added to allow downloading Boost straight from github

# This file is used to add Boost as a library dependency to another project
# This sets up boost to download from sourceforge, and builds it as a cmake
# ExternalProject

# Change this one line to upgrade to newer versions of boost
set( ext.Boost_VERSION "1.64.0" CACHE STRING "Boost version to download/use" )
mark_as_advanced( ext.Boost_VERSION )
string( REPLACE "." "_" ext.Boost_Version_Underscore ${ext.Boost_VERSION} )

message( STATUS "ext.Boost_VERSION: " ${ext.Boost_VERSION} )

# Pick the archive format to download
if( WIN32 )
  # For newer cmake versions, 7z archives are much smaller to download
  if( CMAKE_VERSION VERSION_LESS "3.1.0" )
    set( Boost_Ext "zip" )
  else( )
    set( Boost_Ext "7z" )
  endif( )
else( )
  set( Boost_Ext "tar.bz2" )
endif( )

# Start of the b2 command line; the b2 executable is produced by the bootstrap step
if( WIN32 )
  set( Boost.Command b2 --prefix=${PREFIX_BOOST} )
else( )
  set( Boost.Command ./b2 --prefix=${PREFIX_BOOST} )
endif( )

# Compiler flags forwarded to b2.  Every flag gets its own cxxflags=/linkflags=
# property: Boost.Command is a list, so a bare "-std=c++11" element would be
# passed to b2 as a separate command-line argument instead of reaching the
# compiler.
if( CMAKE_COMPILER_IS_GNUCXX )
  list( APPEND Boost.Command cxxflags=-fPIC cxxflags=-std=c++11 )
elseif( XCODE_VERSION OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang") )
  list( APPEND Boost.Command cxxflags=-std=c++11 cxxflags=-stdlib=libc++ linkflags=-stdlib=libc++ )
endif( )

# Decide how many parallel b2 jobs to run
include( ProcessorCount )
ProcessorCount( Cores )
if( NOT Cores EQUAL 0 )
  # Travis can fail to build Boost sporadically; uses 32 cores, reduce stress on VM
  if( DEFINED ENV{TRAVIS} )
    if( Cores GREATER 8 )
      set( Cores 8 )
    endif( )
  endif( )

  # Add build thread in addition to the number of cores that we have
  math( EXPR Cores "${Cores} + 1 " )
else( )
  # If we could not detect # of cores, assume 1 core and add an additional build thread
  set( Cores "2" )
endif( )

message( STATUS "ExternalBoost using ( " ${Cores} " ) cores to build with" )
message( STATUS "ExternalBoost building [ serialization, filesystem, system, regex ] components" )

list( APPEND Boost.Command -j ${Cores} --with-serialization --with-filesystem --with-system --with-regex )

# BUILD_64 is not set in this file; presumably it comes from the build-bitness
# module included above.
if( BUILD_64 )
  list( APPEND Boost.Command address-model=64 )
else( )
  list( APPEND Boost.Command address-model=32 )
endif( )

# Translate the CMake compiler into a b2 toolset
if( MSVC10 )
  list( APPEND Boost.Command toolset=msvc-10.0 )
elseif( MSVC11 )
  list( APPEND Boost.Command toolset=msvc-11.0 )
elseif( MSVC12 )
  list( APPEND Boost.Command toolset=msvc-12.0 )
elseif( MSVC14 )
  list( APPEND Boost.Command toolset=msvc-14.0 )
elseif( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) )
  list( APPEND Boost.Command toolset=clang )
elseif( CMAKE_COMPILER_IS_GNUCXX )
  list( APPEND Boost.Command toolset=gcc )
endif( )

if( WIN32 AND (ext.Boost_VERSION VERSION_LESS "1.60.0") )
  list( APPEND Boost.Command define=BOOST_LOG_USE_WINNT6_API )
endif( )

# Default the link mode from BUILD_SHARED_LIBS.  The variable is tested by name
# so that an unset BUILD_SHARED_LIBS is simply false instead of expanding to a
# malformed if() expression.
if( NOT DEFINED ext.Boost_LINK )
  if( BUILD_SHARED_LIBS )
    set( ext.Boost_LINK "shared" CACHE STRING "Which boost link method?  static | shared | static,shared" )
  else( )
    set( ext.Boost_LINK "static" CACHE STRING "Which boost link method?  static | shared | static,shared" )
  endif( )
endif()
mark_as_advanced( ext.Boost_LINK )

if( WIN32 )
    # Versioned is the default on windows
    set( ext.Boost_LAYOUT "versioned" CACHE STRING "Which boost layout method?  versioned | tagged | system" )

    # For windows, default to build both variants to support the VS IDE
    set( ext.Boost_VARIANT "debug,release" CACHE STRING "Which boost variant?  debug | release | debug,release" )
else( )
    # Tagged builds provide unique enough names to be able to build both variants
    set( ext.Boost_LAYOUT "tagged" CACHE STRING "Which boost layout method?  versioned | tagged | system" )

    # For Linux, typically a build tree only needs one variant.
    # CMAKE_BUILD_TYPE is tested by name so an empty or unset value (for
    # example with a multi-config generator) falls through to "release".
    if( CMAKE_BUILD_TYPE MATCHES "Debug" )
      set( ext.Boost_VARIANT "debug" CACHE STRING "Which boost variant?  debug | release | debug,release" )
    else( )
      set( ext.Boost_VARIANT "release" CACHE STRING "Which boost variant?  debug | release | debug,release" )
    endif( )
endif( )
mark_as_advanced( ext.Boost_LAYOUT )
mark_as_advanced( ext.Boost_VARIANT )

list( APPEND Boost.Command --layout=${ext.Boost_LAYOUT} link=${ext.Boost_LINK} variant=${ext.Boost_VARIANT} )

message( STATUS "Boost.Command: ${Boost.Command}" )

# If the user has a cached local copy stored somewhere, they can define the full path to the package in a BOOST_URL environment variable
if( DEFINED ENV{BOOST_URL} )
  set( ext.Boost_URL "$ENV{BOOST_URL}" CACHE STRING "URL to download Boost from" )
else( )
  set( ext.Boost_URL "http://sourceforge.net/projects/boost/files/boost/${ext.Boost_VERSION}/boost_${ext.Boost_Version_Underscore}.${Boost_Ext}/download" CACHE STRING "URL to download Boost from" )
endif( )
mark_as_advanced( ext.Boost_URL )

# Bootstrap script and expected archive checksum, per platform / archive type.
# NOTE(review): the hashes are fixed while ext.Boost_VERSION is configurable;
# presumably they match the default 1.64.0 archives - confirm before bumping.
set( Boost.Bootstrap "" )
set( ext.HASH "" )
if( WIN32 )
  set( Boost.Bootstrap "bootstrap.bat" )

  if( CMAKE_VERSION VERSION_LESS "3.1.0" )
    # .zip file
    set( ext.HASH "b99973c805f38b549dbeaf88701c0abeff8b0e8eaa4066df47cac10a32097523" )
  else( )
    # .7z file
    set( ext.HASH "49c6abfeb5b480f6a86119c0d57235966b4690ee6ff9e6401ee868244808d155" )
  endif( )
else( )
  set( Boost.Bootstrap "./bootstrap.sh" )

  # .tar.bz2
  set( ext.HASH "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332" )

  if( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) )
    list( APPEND Boost.Bootstrap --with-toolset=clang )
  endif( )
endif( )

# Below is a fancy CMake command to download, build and install Boost on the user's computer.
# The bootstrap script runs as the update step, b2 'stage' runs in the source
# tree as the build step, and there is no configure or install step here.
ExternalProject_Add(
  boost
  PREFIX ${CMAKE_BINARY_DIR}/boost
  URL ${ext.Boost_URL}
  URL_HASH SHA256=${ext.HASH}
  UPDATE_COMMAND ${Boost.Bootstrap}
  LOG_UPDATE 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ${Boost.Command} stage
  BUILD_IN_SOURCE 1
  LOG_BUILD 1
  INSTALL_COMMAND ""
)

set_property( TARGET boost PROPERTY FOLDER "extern" )
ExternalProject_Get_Property( boost install_dir )
ExternalProject_Get_Property( boost binary_dir )

# For use by the project that includes external-boost.cmake
set( BOOST_INSTALL_ROOT ${install_dir} )
set( BOOST_BINARY_ROOT ${binary_dir} )
rocFFT-rocm-6.4.3/docs/000077500000000000000000000000001501537341300145375ustar00rootroot00000000000000rocFFT-rocm-6.4.3/docs/.gitignore000066400000000000000000000001061501537341300165240ustar00rootroot00000000000000_build/ _doxygen/ _toc.yml doxygen/html doxygen/rtf doxygen/xml rocFFT-rocm-6.4.3/docs/Makefile000066400000000000000000000011331501537341300161750ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = rocFFT SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)rocFFT-rocm-6.4.3/docs/classification-map.xml000066400000000000000000000211301501537341300210240ustar00rootroot00000000000000 Kanika Yadav (external) Microsoft Office User 2020-09-25T06:54:04Z 2021-12-22T19:07:50Z 16.00 true 2021-02-23T09:13:03Z Standard 90c2fedb-0da6-4717-8531-d16a1b9930f4 45597f60-6e37-4be7-acfb-4c9e23b261ea 0 true 2022-01-14T16:33:39Z Privileged AMD Official Use Only-AIP 2.0 3dd8961f-e488-4e60-8e11-a82d994e183d 3ab6c0f7-c658-4f6f-bd9d-6ef921551ff7 1 14235 32767 32767 32767 False False Filename Title Categories Version Doc Type MAP rocm;hip-sdk;hip;gpu;amd;rocfft;fft 4-5 apply-ALL default rocFFT API Guide reference