pax_global_header00006660000000000000000000000064150153734040014514gustar00rootroot0000000000000052 comment=39982887e9dc77e1057a8fc2485bdee05e5aa87d hipFFT-rocm-6.4.3/000077500000000000000000000000001501537340400136045ustar00rootroot00000000000000hipFFT-rocm-6.4.3/.azuredevops/000077500000000000000000000000001501537340400162315ustar00rootroot00000000000000hipFFT-rocm-6.4.3/.azuredevops/rocm-ci.yml000066400000000000000000000012401501537340400203020ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/hipFFT.yml@pipelines_repo hipFFT-rocm-6.4.3/.clang-format000066400000000000000000000065421501537340400161660ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true 
AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 'true' } #BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true #BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: All ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false 
SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- hipFFT-rocm-6.4.3/.githooks/000077500000000000000000000000001501537340400155115ustar00rootroot00000000000000hipFFT-rocm-6.4.3/.githooks/install000077500000000000000000000002221501537340400171010ustar00rootroot00000000000000#!/usr/bin/env bash cd $(git rev-parse --git-dir) cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" hipFFT-rocm-6.4.3/.githooks/pre-commit000077500000000000000000000017661501537340400175250ustar00rootroot00000000000000#!/bin/sh # # This pre-commit hook checks if any versions of clang-format # are installed, and if so, uses the installed version to format # the staged changes. base=/opt/rocm/hcc/bin/clang-format format="" # Redirect output to stderr. exec 1>&2 # check if clang-format is installed type "$base" >/dev/null 2>&1 && format="$base" # no versions of clang-format are installed if [ -z "$format" ] then echo "$base is not installed. Pre-commit hook will not be executed." 
exit 0 fi # Do everything from top - level cd $(git rev-parse --show-toplevel) if git rev-parse --verify HEAD >/dev/null 2>&1 then against=HEAD else # Initial commit: diff against an empty tree object against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 fi # do the formatting for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') do if [ -e "$file" ] then echo "$format $file" "$format" -i -style=file "$file" fi done hipFFT-rocm-6.4.3/.github/000077500000000000000000000000001501537340400151445ustar00rootroot00000000000000hipFFT-rocm-6.4.3/.github/CODEOWNERS000077500000000000000000000005761501537340400165520ustar00rootroot00000000000000* @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts # Documentation files docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation # Header directory for Doxygen documentation library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts hipFFT-rocm-6.4.3/.github/CONTRIBUTING.md000066400000000000000000000151701501537340400174010ustar00rootroot00000000000000 # Contributing to hipFFT # We welcome contributions to hipFFT. Please follow these details to help ensure your contributions will be successfully accepted. ## Issue Discussion ## Please use the GitHub Issues tab to notify us of issues. * Use your best judgment for issue creation. If your issue is already listed, upvote the issue and comment or post to provide additional details, such as how you reproduced this issue. * If you're not sure if your issue is the same, err on the side of caution and file your issue. You can add a comment to include the issue number (and link) for the similar issue. If we evaluate your issue as being the same as the existing issue, we'll close the duplicate. * If your issue doesn't exist, use the issue template to file a new issue. 
* When filing an issue, be sure to provide as much information as possible, including script output so we can collect information about your configuration. This helps reduce the time required to reproduce your issue. * Check your issue regularly, as we may require additional information to successfully reproduce the issue. * You may also open an issue to ask questions to the maintainers about whether a proposed change meets the acceptance criteria, or to discuss an idea pertaining to the library. ## Acceptance Criteria ## When a contribution is submitted via a pull request, a number of automated checks are run in order to verify compilation correctness and prevent performance regressions. These checks include: * Building and testing the change on various OS platforms (Ubuntu, RHEL, etc.) * Running on different AMD GPU architectures (MI-series, Radeon series cards, etc.) * Running on different NVIDIA GPU architectures (V100, A100, etc) * Running benchmarks to check for performance degradation In order for a submission to be accepted: * It must pass all of the automated checks * It must undergo a code review Users can visualize our continuous integration infrastructure in: `hipFFT/.jenkins`. The GitHub "Issues" tab may also be used to discuss ideas surrounding particular features or changes before raising pull requests. ## Code Structure ## In a broad view, hipFFT library is structured as follows: ├── docs/: contains hipFFT documentation ├── library/: contains main source code and headers │   ├── src/amd_detail/ : for porting to AMD devices │   ├── src/nvidia_detail/ : for porting to NVIDIA devices ├── clients/: │   ├── bench/ : contains benchmarking code │   ├── samples/ : contains examples │   ├── tests/ : contains our test infrastructure ├── shared/: contains important global headers and those for linking to other applications ## Coding Style ## * All public APIs are C89 compatible; all other library code should use c++17. 
* Our minimum supported compiler is clang 3.6. * Avoid CamelCase: this rule applies specifically to publicly visible APIs, but is encouraged (not mandated) for internal code. * C and C++ code should be formatted using `clang-format`. You can use the clang-format style configuration available in `hipFFT/.clang-format`. To format a C/C++ file, use: ``` clang-format -style=file -i <path-to-source-file> ``` * Python code should use: ``` yapf --style pep8 ``` ## Pull Request Guidelines ## Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. Note that a [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. The following guidelines apply: * When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch. * Note that releases are cut to release/rocm-rel-x.y, where x and y refer to the release major and minor numbers. * Ensure code builds successfully. * Do not break existing test cases. * Code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit. ### Deliverables ### New changes should include test coverage. Our testing infrastructure is located in `clients/tests/`, and can be used as a reference. The following guidelines apply: * New functionality will only be merged with new unit tests. * New unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). * Tests must have good code coverage. 
### Process ### All pull requests must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. Once a contribution is ready to be submitted, consider the following: * Before you create a PR, ensure that all changed files have been run through clang-format: clang-format -i <path-to-source-file> * While creating a PR, you can take a look at a `diff` of the changes you made using the PR's "Files" tab, and verify that no unintentional changes are being submitted. * Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails. * During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. * When a modification request has been completed, the conversation thread about it will be marked as resolved. * To update the code in your PR (e.g., in response to a code review discussion), you can simply push another commit to the branch used in your pull request. * Once your contribution is approved, we will use the *squash merge* option from GitHub to integrate it into the corresponding branch. ## Code License ## All code contributed to this project will be licensed under the license identified in the [LICENSE.md](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md). Your contribution will be accepted under the same license. 
hipFFT-rocm-6.4.3/.github/ISSUE_TEMPLATE.md000066400000000000000000000004611501537340400176520ustar00rootroot00000000000000### What is the expected behavior - ### What actually happens - ### How to reproduce - ### Environment | Hardware | description | |-----|-----| | GPU | device string | | CPU | device string | | Software | version | |-----|-----| | ROCK | v0.0 | | ROCR | v0.0 | | HCC | v0.0 | | Library | v0.0 | hipFFT-rocm-6.4.3/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000000701501537340400207420ustar00rootroot00000000000000resolves #___ Summary of proposed changes: - - - hipFFT-rocm-6.4.3/.github/dependabot.yml000066400000000000000000000013001501537340400177660ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "monthly" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" - "malcolmroberts" - "evetsso" hipFFT-rocm-6.4.3/.gitignore000066400000000000000000000006461501537340400156020ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # vim tags tags .tags .*.swp # Visual Studio Code .vscode # documentation artifacts build/ _build/ _images/ _static/ _templates/ _toc.yml docBin/ # python bytecode __pycache__ 
hipFFT-rocm-6.4.3/.jenkins/000077500000000000000000000000001501537340400153235ustar00rootroot00000000000000hipFFT-rocm-6.4.3/.jenkins/common.groovy000066400000000000000000000071511501537340400200660ustar00rootroot00000000000000import static groovy.io.FileType.FILES def runCompileCommand(platform, project, jobName, boolean sameOrg = false) { project.paths.construct_build_prefix() def getDependenciesCommand = "" if (project.installLibraryDependenciesFromCI) { project.libraryDependencies.each { libraryName -> getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, sameOrg) } } String cmake = platform.jenkinsLabel.contains('centos') ? "cmake3" : "cmake" String warningArgs = platform.jenkinsLabel.contains('cuda') ? '':'-DWERROR=ON' String hipClang = platform.jenkinsLabel.contains('hipClang') ? "HIP_COMPILER=clang" : "" String path = platform.jenkinsLabel.contains('centos7') ? "export PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH" : ":" String dir = jobName.contains('Debug') ? "debug" : "release" // hipcc with CUDA backend needs HIP_PLATFORM set accordingly in the environment String hipPlatformCommand = platform.jenkinsLabel.contains("cuda") ? 
"export HIP_PLATFORM=nvidia" : "" def command = """#!/usr/bin/env bash set -x ls /fftw/lib export FFTW_ROOT=/fftw export FFTW_INCLUDE_PATH=\${FFTW_ROOT}/include export FFTW_LIB_PATH=\${FFTW_ROOT}/lib export LD_LIBRARY_PATH=\${FFTW_LIB_PATH}:/opt/rocm/lib:/opt/rocm/hip/lib export CPLUS_INCLUDE_PATH=\${FFTW_INCLUDE_PATH}:\${CPLUS_INCLUDE_PATH} export CMAKE_PREFIX_PATH=\${FFTW_LIB_PATH}/cmake/fftw3:\${CMAKE_PREFIX_PATH} export CMAKE_PREFIX_PATH=\${FFTW_LIB_PATH}/cmake/fftw3f:\${CMAKE_PREFIX_PATH} # default container flags cause problems for CUDA backend, and aren't useful for ROCm unset HIPCC_COMPILE_FLAGS_APPEND unset HIPCC_LINK_FLAGS_APPEND ${hipPlatformCommand} cd ${project.paths.project_build_prefix} mkdir -p build/${dir} && cd build/${dir} ${getDependenciesCommand} ${path} ${hipClang} ${cmake} ${warningArgs} ${project.paths.build_command} make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project, gfilter) { String cudaArgs = platform.jenkinsLabel.contains('cuda') ? '--double_epsilon=5e-11' : '--precompile=rocfft-test-precompile.db' def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix}/build/release/clients/staging GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./hipfft-test ${cudaArgs} --gtest_output=xml --gtest_color=yes --gtest_filter=${gfilter} """ platform.runCommand(this, command) //junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" } def runPackageCommand(platform, project, jobName, label='') { def command label = label != '' ? '-' + label.toLowerCase() : '' String ext = platform.jenkinsLabel.contains('ubuntu') ? "deb" : "rpm" String dir = jobName.contains('Debug') ? 
"debug" : "release" command = """ set -x cd ${project.paths.project_build_prefix}/build/${dir} make package mkdir -p package for f in hipfft*.$ext do mv "\$f" "hipfft${label}-\${f#*-}" done mv *.${ext} package/ """ platform.runCommand(this, command) platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/${dir}/package/*.${ext}""") } return this hipFFT-rocm-6.4.3/.jenkins/debug.groovy000066400000000000000000000052531501537340400176650ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName, buildCommand -> def prj = new rocProject('hipFFT', 'Debug') // customize for project prj.paths.build_command = buildCommand prj.libraryDependencies = ['rocRAND', 'rocFFT', 'hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> project.paths.construct_build_prefix() commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, null, null) } def setupCI(urlJobName, jobNameList, buildCommand, runCI, label) { jobNameList = auxiliary.appendJobNameList(jobNameList) jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(label + ' ' + jobName) { runCI(nodeDetails, jobName, buildCommand, label) } } // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(label + ' ' + urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName, buildCommand, label) } } } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } def hostJobNameList = ["compute-rocm-dkms-no-npi":([ubuntu18:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx906']])] def hipClangJobNameList = ["compute-rocm-dkms-no-npi":([ubuntu18:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx906']])] String hostBuildCommand = '-DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Debug -L ../..' String hipClangBuildCommand = '-DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_BUILD_TYPE=Debug -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_SAMPLES=ON -L ../..' 
setupCI(urlJobName, hostJobNameList, hostBuildCommand, runCI, 'g++') setupCI(urlJobName, hipClangJobNameList, hipClangBuildCommand, runCI, 'hip-clang') } hipFFT-rocm-6.4.3/.jenkins/multigpu.groovy000066400000000000000000000053711501537340400204460ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName, buildCommand, label, runTest -> def prj = new rocProject('hipFFT', 'multigpu') // customize for project prj.paths.build_command = buildCommand prj.libraryDependencies = ['rocRAND', 'rocFFT', 'hipRAND'] prj.timeout.test = 360 // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> project.paths.construct_build_prefix() commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project,jobName) } def testCommand = { platform, project-> def gfilter = "*multi_gpu*" commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project, jobName, label) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, runTest ? testCommand : null, packageCommand) } def setupCI(urlJobName, jobNameList, buildCommand, runCI, label, runTest) { jobNameList = auxiliary.appendJobNameList(jobNameList) jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(label + ' ' + jobName) { runCI(nodeDetails, jobName, buildCommand, label, runTest) } } // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(label + ' ' + urlJobName) { /* Fallback for job names missing from jobNameList: forward runTest so the call matches the five-parameter runCI closure, as the listed-job call does. */ runCI([ubuntu18:['gfx906']], urlJobName, buildCommand, label, runTest) } } } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["main":[pipelineTriggers([cron('0 6 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["main":([ubuntu20:['8gfx90a']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } String hipClangBuildCommand = '-DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_SAMPLES=ON -L ../..' setupCI(urlJobName, jobNameList, hipClangBuildCommand, runCI, 'hip-clang', true) } hipFFT-rocm-6.4.3/.jenkins/staticanalysis.groovy000066400000000000000000000015161501537340400216300ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName -> def prj = new rocProject('hipFFT-internal', 'PreCheckin') // customize for project prj.libraryDependencies = ['rocRAND','rocFFT'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = true boolean staticAnalysis = true buildProject(prj, formatCheck, nodes.dockerArray, null, null, null, staticAnalysis) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 7')])])) stage(urlJobName) { runCI([ubuntu22:['any']], urlJobName) } } hipFFT-rocm-6.4.3/.jenkins/staticlibrary.groovy000066400000000000000000000057361501537340400214610ustar00rootroot00000000000000#!/usr/bin/env groovy 
@Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName, buildCommand, label -> def prj = new rocProject('hipFFT-internal', 'StaticLibrary') // customize for project prj.paths.build_command = buildCommand prj.libraryDependencies = ['rocRAND','rocFFT'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> project.paths.construct_build_prefix() commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, true) } def testCommand = { platform, project-> def gfilter = "*" commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project, jobName, label) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } def setupCI(urlJobName, jobNameList, buildCommand, runCI, label) { jobNameList = auxiliary.appendJobNameList(jobNameList) jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(label + ' ' + jobName) { runCI(nodeDetails, jobName, buildCommand, label) } } // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(label + ' ' + urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName, buildCommand, label) } } } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } def hostJobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] def hipClangJobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] String hostBuildCommand = '-DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DBUILD_SHARED_LIBS=OFF -L ../..' String hipClangBuildCommand = '-DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_BUILD_TYPE=RelWithDebInfo -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_SHARED_LIBS=OFF -L ../..' setupCI(urlJobName, hostJobNameList, hostBuildCommand, runCI, 'g++') setupCI(urlJobName, hipClangJobNameList, hipClangBuildCommand, runCI, 'hip-clang') } hipFFT-rocm-6.4.3/.readthedocs.yaml000066400000000000000000000005021501537340400170300ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "3.10" hipFFT-rocm-6.4.3/CHANGELOG.md000066400000000000000000000132421501537340400154170ustar00rootroot00000000000000# Changelog for hipFFT Documentation for hipFFT is available at [https://rocm.docs.amd.com/projects/hipFFT/en/latest/](https://rocm.docs.amd.com/projects/hipFFT/en/latest/). 
## hipFFT 1.0.18 for ROCm 6.4.0 ### Added * Implemented the `hipfftMpAttachComm`, `hipfftXtSetDistribution`, and `hipfftXtSetSubformatDefault` APIs to allow computing FFTs that are distributed between multiple MPI (Message Passing Interface) processes. These APIs can be enabled with the `HIPFFT_MPI_ENABLE` CMake option, which defaults to `OFF`. The backend FFT library called by hipFFT must support MPI for these APIs to work. ### Changed * Building with the address sanitizer option sets xnack+ for the relevant GPU architectures. * Use find_package CUDAToolkit instead of CUDA in cmake for modern-cmake compatibility. * The `AMDGPU_TARGETS` build variable should be replaced with `GPU_TARGETS`. `AMDGPU_TARGETS` is deprecated. ### Resolved issues * Fixed client packages to depend on hipRAND instead of rocRAND. ## hipFFT 1.0.17 for ROCm 6.3.0 ### Added * Support for the gfx1151, gfx1200, and gfx1201 architectures * hipfft-test now includes a --smoketest option. ### Changed * The AMD backend is now compiled using amdclang++ instead of hipcc. The NVIDIA CUDA backend still uses hipcc-nvcc. * CLI11 replaces Boost Program Options as the command line parser for clients. ## hipFFT 1.0.16 for ROCm 6.2.4 ### Changed * Support gfx1151 architecture. ## hipFFT 1.0.15 for ROCm 6.2.0 ### Fixes * Added hip::host as a public link library, as hipfft.h includes HIP runtime headers. * Prevent C++ exceptions leaking from public API functions. * Make output of hipfftXt match cufftXt in geometry and alignment for 2D and 3D FFTs. ## hipFFT 1.0.14 for ROCm 6.1.0 ### Changes * When building hipFFT from source, rocFFT code no longer needs to be initialized as a git submodule. ### Fixes * Fixed error when creating length-1 plans. 
## hipFFT 1.0.13 for ROCm 6.0.0 ### Changes * `hipfft-rider` has been renamed to `hipfft-bench`; it is controlled by the `BUILD_CLIENTS_BENCH` CMake option (note that a link for the old file name is installed, and the old `BUILD_CLIENTS_RIDER` CMake option is accepted for backwards compatibility, but both will be removed in a future release) * Binaries in debug builds no longer have a `-d` suffix * The minimum rocFFT required version has been updated to 1.0.21 ### Additions * `hipfftXtSetGPUs`, `hipfftXtMalloc, hipfftXtMemcpy`, `hipfftXtFree`, and `hipfftXtExecDescriptor` APIs have been implemented to allow FFT computing on multiple devices in a single process ## hipFFT 1.0.12 for ROCm 5.6.0 ### Additions * `hipfftXtMakePlanMany`, `hipfftXtGetSizeMany`, and `hipfftXtExec` APIs have been implemented to allow half-precision transform requests ### Changes * Added the `--precision` argument to benchmark and test clients (`--double` is still accepted, but has been deprecated as a method to request a double-precision transform) ## hipFFT 1.0.11 for ROCm 5.5.0 ### Fixes * Fixed old version ROCm include and lib folders that were not removed during upgrades ## hipFFT 1.0.10 for ROCm 5.4.0 ### Additions * Added the `hipfftExtPlanScaleFactor` API to efficiently multiply each output element of an FFT by a given scaling factor (result scaling must be supported in the backend FFT library) ### Changes * rocFFT 1.0.19 or higher is now required for hipFFT builds on the rocFFT backend * Data are initialized directly on GPUs using hipRAND * Updated build files now use standard C++17 ## hipFFT 1.0.9 for ROCm 5.3.0 ### Changes * Cleaned up build warnings * GNUInstallDirs enhancements * GoogleTest 1.11 is required ## hipFFT 1.0.8 for ROCm 5.2.0 ### Additions * Added file and folder reorganization changes with backward compatibility support when using rocm-cmake wrapper functions * New packages for test and benchmark executables on all supported operating systems that use CPack * 
Implemented `hipfftMakePlanMany64` and `hipfftGetSizeMany64` ## hipFFT 1.0.7 for ROCm 5.1.0 ### Changes * Use `fft_params` struct for accuracy and benchmark clients ## hipFFT 1.0.6 for ROCm 5.0.0 ### Fixes * Incorrect reporting of rocFFT version ### Changes * Unconditionally enabled callback functionality: On the CUDA backend, callbacks only run correctly when hipFFT is built as a static library, and linked against the static cuFFT library ## hipFFT 1.0.5 for ROCm 4.5.0 ### Additions * Added support for Windows 10 as a build target ### Changes * Packaging has been split into a runtime package (`hipfft`) and a development package (`hipfft-devel`): The development package depends on the runtime package. When installing the runtime package, the package manager will suggest the installation of the development package to aid users transitioning from the previous version's combined package. This suggestion by package manager is for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion` feature in the runtime package is introduced as a deprecated feature and will be removed in a future ROCm release. 
## hipFFT 1.0.4 for ROCm 4.4.0 ### Fixes * Add calls to rocFFT setup and cleanup * CMake fixes for clients and backend support ### Additions * Added support for Windows 10 as a build target ## hipFFT 1.0.3 for ROCm 4.3.0 ### Fixes * CMake updates ### Additions * New callback API in `hipfftXt.h` header ## hipFFT 1.0.2 for ROCm 4.2.0 * No changes ## hipFFT 1.0.1 for ROCm 4.1.0 ### Fixes * Batch support for `hipfftMakePlanMany` * Work area handling during plan creation and `hipfftSetWorkArea` * Honour `autoAllocate` flag ### Changes * Testing infrastructure reuses code from [rocFFT](https://github.com/ROCmSoftwarePlatform/rocFFT) hipFFT-rocm-6.4.3/CMakeLists.txt000066400000000000000000000252121501537340400163460ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################

# Top-level build script for hipFFT: declares the build options, picks the
# compiler/backend combination, builds the library (and optionally the
# clients) and configures packaging.

# CMake version according to latest ROCm platform requirements
cmake_minimum_required( VERSION 3.17 )

# We use C++17 features, this will add compile option: -std=c++17
set( CMAKE_CXX_STANDARD 17 )
set(CMAKE_CXX_EXTENSIONS OFF)

# Consider removing this in the future
# This should appear before the project command, because it does not use FORCE
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# Workarounds..
list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm )
list( APPEND CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip /opt/rocm/lib/cmake/hip /opt/rocm/hip/cmake )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D.
# MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Read by clients/bench/CMakeLists.txt to choose the benchmark output directory.
set( HIPFFT_BUILD_SCOPE ON )

project( hipfft LANGUAGES CXX )

# Build options
option( BUILD_SHARED_LIBS "Build ${PROJECT_NAME} as a shared library" ON )
option( BUILD_VERBOSE "Output additional build information" OFF )
option( HIPFFT_MPI_ENABLE "Build with MPI support for distributed transforms" OFF )

set( BUILD_WITH_COMPILER "HOST-default" CACHE INTERNAL
  "Build ${PROJECT_NAME} with compiler HIP-clang, HIP-nvcc, or just the host default compiler, eg g++")
set( BUILD_WITH_LIB "ROCM" CACHE STRING "Build ${PROJECT_NAME} with ROCM or CUDA libraries" )

option( BUILD_CLIENTS "Build all clients" OFF)
option( BUILD_CLIENTS_BENCH "Build benchmark client" OFF )
option( BUILD_CLIENTS_TESTS "Build ${PROJECT_NAME} tests (requires 3rd dependencies)" OFF )
option( BUILD_CLIENTS_SAMPLES "Build examples" OFF )
option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)

# Provide ability to disable hipRAND dependency
option(USE_HIPRAND "Build using hipRAND for test input generation instead of host-side generation" ON)

if( USE_HIPRAND )
  add_compile_definitions(USE_HIPRAND)
endif( )

option( WERROR "Treat warnings as errors" OFF )

set(DEFAULT_GPUS
  gfx803
  gfx900
  gfx906
  gfx908
  gfx90a
  gfx940
  gfx941
  gfx942
  gfx1030
  gfx1100
  gfx1101
  gfx1102
  gfx1151
  gfx1200
  gfx1201)

# Address-sanitizer builds replace the default GPU list with xnack+ targets.
if(BUILD_ADDRESS_SANITIZER)
  add_compile_options(-fsanitize=address)
  add_link_options(-fsanitize=address)
  add_link_options(-shared-libasan)
  SET(DEFAULT_GPUS
    gfx908:xnack+
    gfx90a:xnack+
    gfx940:xnack+
    gfx941:xnack+
    gfx942:xnack+)
  add_link_options(-fuse-ld=lld)
  add_compile_definitions(ADDRESS_SANITIZER)
endif()

# Set internal BUILD_WITH_COMPILER.
if(NOT (CMAKE_CXX_COMPILER MATCHES ".*hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+"))
  set( BUILD_WITH_COMPILER "HOST-default" )
else()
  # The environment variable must be quoted: when HIP_PLATFORM is unset an
  # unquoted $ENV{HIP_PLATFORM} expands to no argument at all, leaving
  # if( MATCHES "nvidia" ), which CMake rejects at configure time.
  if( "$ENV{HIP_PLATFORM}" MATCHES "nvidia" )
    set( BUILD_WITH_COMPILER "HIP-nvcc" )
  else()
    set( BUILD_WITH_COMPILER "HIP-clang" )
    if( NOT BUILD_WITH_LIB STREQUAL "ROCM" )
      message( FATAL_ERROR "Detected HIP_COMPILER=clang, but BUILD_WITH_LIB is not ROCM!" )
    endif()
  endif()
endif()

# Comparisons below are made against the upper-cased spellings.
string( TOUPPER "${BUILD_WITH_COMPILER}" BUILD_WITH_COMPILER )
string( TOUPPER "${BUILD_WITH_LIB}" BUILD_WITH_LIB )

# nvc++ doesn't understand warning flags
if( NOT CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+" )
  set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wno-unknown-pragmas)
  if( WERROR )
    set( WARNING_FLAGS ${WARNING_FLAGS} -Werror )
  endif()
endif()

# Dependencies
include(cmake/dependencies.cmake)

if (BUILD_WITH_COMPILER STREQUAL "HIP-NVCC" )
  # Building through nvcc: force the CUDA backend and prefix host compiler,
  # linker and warning flags with -Xcompiler= / -Xlinker=.
  set (BUILD_WITH_LIB "CUDA")
  set( HIP_PLATFORM "nvidia" )
  set( CMAKE_CXX_EXTENSIONS OFF )
  set( CMAKE_CXX_COMPILE_OPTIONS_PIC "-Xcompiler=${CMAKE_CXX_COMPILE_OPTIONS_PIC}" )
  set( CMAKE_SHARED_LIBRARY_C_FLAGS "-Xlinker=${CMAKE_SHARED_LIBRARY_C_FLAGS}" )
  set( CMAKE_SHARED_LIBRARY_CXX_FLAGS "-Xlinker=${CMAKE_SHARED_LIBRARY_CXX_FLAGS}" )
  set( CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Xlinker=-soname," )
  set( CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Xlinker=-soname," )
  set( CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "-Xlinker=-rpath," )
  set( CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG "-Xlinker=-rpath," )
  set( CMAKE_EXECUTABLE_RUNTIME_C_FLAG "-Xlinker=-rpath," )
  set( CMAKE_EXECUTABLE_RUNTIME_CXX_FLAG "-Xlinker=-rpath," )
  set( CMAKE_C_COMPILE_OPTIONS_VISIBILITY "-Xcompiler='${CMAKE_C_COMPILE_OPTIONS_VISIBILITY}'" )
  set( CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY "-Xcompiler='${CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY}'" )
  set( CMAKE_C_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN "-Xcompiler='${CMAKE_C_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN}'" )
  set( CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN "-Xcompiler='${CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN}'" )
  foreach( FLAG IN ITEMS ${WARNING_FLAGS} )
    set( NVCC_WARNING_FLAGS ${NVCC_WARNING_FLAGS} "-Xcompiler=${FLAG}" )
  endforeach()
  set( WARNING_FLAGS ${NVCC_WARNING_FLAGS} )
else()
  # Define GPU targets
  if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
    message( DEPRECATION "AMDGPU_TARGETS use is deprecated. Use GPU_TARGETS." )
  endif()
  set(AMDGPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined. (Deprecated, prefer GPU_TARGETS)")
  rocm_check_target_ids(AMDGPU_TARGETS TARGETS "${AMDGPU_TARGETS}")
  # Don't force, users should be able to override GPU_TARGETS at the command line if desired
  set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for")

  if( BUILD_WITH_COMPILER STREQUAL "HIP-CLANG" )
    set( HIP_PLATFORM "amd" )
    set( HIP_COMPILER "clang" )
  endif()
endif()

# Show the actual compiler(internal option)
message(STATUS "BUILD_WITH_COMPILER = " ${BUILD_WITH_COMPILER})

# FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG
option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF)
if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32)
  rocm_wrap_header_dir(
    ${CMAKE_SOURCE_DIR}/library/include
    PATTERNS "*.h"
    GUARDS SYMLINK WRAPPER
    WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR}
  )
endif()

# Version
set( VERSION_STRING "1.0.18" )
set( hipfft_SOVERSION 0.1 )

if( ROCM_FOUND )
  rocm_setup_version( VERSION ${VERSION_STRING} )
endif()

add_subdirectory( library )

# Build clients of the library
if( BUILD_CLIENTS )
  set( BUILD_CLIENTS_BENCH ON )
  set( BUILD_CLIENTS_SAMPLES ON )
  set( BUILD_CLIENTS_TESTS ON )
endif()

# old name for BUILD_CLIENTS_BENCH
if( BUILD_CLIENTS_RIDER )
  set( BUILD_CLIENTS_BENCH ${BUILD_CLIENTS_RIDER} )
endif()

# Build clients of the library
if( BUILD_CLIENTS_BENCH OR BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS )
  include( clients/cmake/build-options.cmake )
  rocm_package_setup_component(clients)
  if(NOT CLIENTS_OS)
    rocm_set_os_id(CLIENTS_OS)
    string(TOLOWER "${CLIENTS_OS}" CLIENTS_OS)
    rocm_read_os_release(CLIENTS_OS_VERSION VERSION_ID)
  endif()
  message(STATUS "OS: ${CLIENTS_OS} ${CLIENTS_OS_VERSION}")
  # Package names of the FFTW runtime differ per distribution family.
  set(FFTW_DEB "libfftw3-bin")
  if(CLIENTS_OS STREQUAL "sles")
    set(FFTW_RPM "libfftw3-3")
  elseif(CLIENTS_OS STREQUAL "mariner")
    # NOTE(review): BOOST_RPM is not referenced again in this file and the
    # leading RPM token becomes part of its value - confirm this is intended.
    set(BOOST_RPM RPM "boost = ${Boost_VERSION_MAJOR}_${Boost_VERSION_MINOR}_${Boost_VERSION_PATCH}")
    set(FFTW_RPM "fftw-libs")
  else()
    set(FFTW_RPM "fftw-libs")
  endif()

  if( USE_HIPRAND )
    set( HIPRAND_DEP hiprand )
  endif()

  if(BUILD_CLIENTS_TESTS)
    rocm_package_setup_client_component(
      tests
      DEPENDS
        DEB ${FFTW_DEB} ${HIPRAND_DEP}
        RPM ${FFTW_RPM} ${HIPRAND_DEP}
    )
  endif()

  if(BUILD_CLIENTS_BENCH)
    rocm_package_setup_client_component(
      benchmarks
      DEPENDS
        DEB ${HIPRAND_DEP}
        RPM ${HIPRAND_DEP}
    )
  endif()

  add_subdirectory( clients )
endif()

# Packaging...
if(WIN32)
  set(CPACK_SOURCE_GENERATOR "ZIP")
  set(CPACK_GENERATOR "ZIP")
  set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE)
  set(INSTALL_PREFIX "C:/hipSDK")
  set(CPACK_SET_DESTDIR OFF)
  set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK")
  set(CPACK_PACKAGING_INSTALL_PREFIX "")
  set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
endif()

if( ROCM_FOUND )
  # Package specific CPACK vars
  if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
    rocm_package_add_dependencies(DEPENDS "rocfft >= 1.0.21")
  else()
    if( NVHPC_FOUND )
      # Dots in the NVHPC version are replaced by dashes to form the package name.
      string( REPLACE "." "-" NVHPC_PKG_VERSION ${NVHPC_VERSION} )
      rocm_package_add_dependencies(DEPENDS "nvhpc-${NVHPC_PKG_VERSION}")
    else()
      rocm_package_add_dependencies(DEPENDS "cufft >= 10.0.0")
    endif()
  endif()

  set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" )
  set( CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" )

  # Give hipfft compiled for CUDA backend a different name
  if( BUILD_WITH_LIB STREQUAL "ROCM" )
    set( package_name hipfft )
  else()
    set( package_name hipfft-alt )
  endif()

  set( HIPFFT_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file" )

  rocm_create_package(
    NAME ${package_name}
    DESCRIPTION "ROCm FFT marshalling library"
    MAINTAINER "hipfft-maintainer@amd.com"
    LDCONFIG
    LDCONFIG_DIR ${HIPFFT_CONFIG_DIR}
  )
endif()
hipFFT-rocm-6.4.3/CppCheckSuppressions.txt000066400000000000000000000001531501537340400204620ustar00rootroot00000000000000// has some false positives and isn't hard to run manually for periodic // dead code sweeps unusedFunction hipFFT-rocm-6.4.3/LICENSE.md000066400000000000000000000054131501537340400152130ustar00rootroot00000000000000MIT License Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. This product includes software from copyright holders as shown below, and distributed under their license terms as specified. CLI11 2.2 Copyright (c) 2017-2024 University of Cincinnati, developed by Henry Schreiner under NSF AWARD 1414736. All rights reserved. Redistribution and use in source and binary forms of CLI11, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. hipFFT-rocm-6.4.3/README.md000066400000000000000000000071571501537340400150750ustar00rootroot00000000000000# hipFFT hipFFT is an FFT marshalling library that supports [rocFFT](https://github.com/ROCmSoftwarePlatform/rocFFT) and [cuFFT](https://developer.nvidia.com/cufft) backends. hipFFT exports an interface that doesn't require the client to change, regardless of the chosen backend. It sits between your application and the backend FFT library, where it marshals inputs to the backend and marshals results back to your application. ## Documentation > [!NOTE] > The published hipFFT documentation is available at [hipFFT](https://rocm.docs.amd.com/projects/hipFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the hipFFT/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). To build our documentation locally, run the following code: ```bash cd docs pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` ## Build and install You can download pre-built packages from the [ROCm package servers](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html). 
If you're using Ubuntu, you can run: `sudo apt update && sudo apt install hipfft`. ### Building from source To build hipFFT from source, follow these steps: 1. Install the library build dependencies: * On AMD platforms, you must install [rocFFT](https://github.com/ROCmSoftwarePlatform/rocFFT). * On NVIDIA platforms, you must install [cuFFT](https://developer.nvidia.com/cufft). 2. Install the client build dependencies: * The clients (samples, tests, etc) included with the hipFFT source depend on hipRAND, FFTW and GoogleTest. 3. Build hipFFT: To show all build options: ```bash mkdir build && cd build cmake -LH .. ``` Here are some CMake build examples: * AMD GPU * Case: Build a project using HIP language APIs + hipFFT with standard host compiler * Code: `cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -L ..` * Case: Build a project using HIP language APIs + hipFFT + device kernels with HIP-Clang * Code: `cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_BUILD_TYPE=Release -DBUILD_CLIENTS=ON -L ..` * NVIDIA GPU * Case: Build a project using HIP language APIs + hipFFT with standard host compiler * Code: `cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -DBUILD_WITH_LIB=CUDA -L ..` * Case: Build a project using HIP language APIs + hipFFT + device kernels with HIP-NVCC * Code: `HIP_PLATFORM=nvidia cmake -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_BUILD_TYPE=Release -DBUILD_CLIENTS=ON -L ..` ```note The `-DBUILD_CLIENTS=ON` option is only allowed with the amdclang++ or HIPCC compilers. ``` ## Porting from CUDA If you have existing CUDA code and want to transition to HIP, follow these steps: 1. [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY) your code and fix all unsupported CUDA features and user-defined macros 2. Build with HIP-NVCC to run on an NVIDIA device 3. 
Build with HIP-Clang to run on an AMD device More information about porting to HIP is available in the [HIP porting guide](https://rocm.docs.amd.com/projects/HIP/en/develop/user_guide/hip_porting_guide.html). ## Support You can report bugs and feature requests through the GitHub [issue tracker](https://github.com/ROCm/hipFFT/issues). ## Contribute If you want to contribute to hipFFT, you must follow our [contribution guidelines](https://github.com/ROCm/hipFFT/blob/develop/.github/CONTRIBUTING.md). hipFFT-rocm-6.4.3/clients/000077500000000000000000000000001501537340400152455ustar00rootroot00000000000000hipFFT-rocm-6.4.3/clients/CMakeLists.txt000066400000000000000000000110061501537340400200030ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Build script for the hipFFT clients. Depending on BUILD_CLIENTS_SAMPLES,
# BUILD_CLIENTS_TESTS and BUILD_CLIENTS_BENCH it adds the samples, tests and
# bench subdirectories; hipfft is located with find_package when it is not
# already a target of the enclosing build.

# CMake version according to latest ROCm platform requirements
cmake_minimum_required( VERSION 3.16 )

# We use C++17 features, this will add compile option: -std=c++17
set( CMAKE_CXX_STANDARD 17 )
set(CMAKE_CXX_EXTENSIONS OFF)

# Consider removing this in the future
# This should appear before the project command, because it does not use FORCE
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does
# not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Read by clients/bench/CMakeLists.txt when choosing the benchmark output directory.
set( HIPFFT_CLIENTS_BUILD_SCOPE ON )

# This project may compile dependencies for clients
project( hipfft-clients LANGUAGES CXX )

# Lets include( build-options ) resolve to the module in clients/cmake.
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

include( build-options )

# BUILD_CLIENTS=ON is rejected unless the compiler path matches hipcc,
# clang++, nvcc or nvc++.
if(NOT (CMAKE_CXX_COMPILER MATCHES ".*hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+" OR CMAKE_CXX_COMPILER MATCHES ".*nvcc" OR CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+" ) )
  if(BUILD_CLIENTS)
    message( FATAL_ERROR "Using BUILD_CLIENTS=ON requires a compiler capable of building device code (hipcc, clang, nvcc, nvc++)." )
  endif()
endif()

# This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
# all the time
# This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
set( CMAKE_EXPORT_COMPILE_COMMANDS ON )

# if hipfft is not a target, then we know clients are built separately from the library and we must
# search for the hipfft package
if( NOT TARGET hipfft )
  find_package( hipfft REQUIRED CONFIG PATHS )
endif( )

if( BUILD_CLIENTS_SAMPLES )
  add_subdirectory( samples )
endif( )

if( BUILD_CLIENTS_TESTS )
  # Use an installed GoogleTest (1.11.0 requested) when find_package locates
  # one; otherwise download the release-1.11.0 archive and build it as an
  # external project with static libraries and no install step.
  find_package( GTest 1.11.0 )
  include( ExternalProject )
  if( NOT GTEST_FOUND )
    set( GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/src/gtest/googletest/include )
    set( GTEST_LIBRARIES
      ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}
      ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX} )
    ExternalProject_Add( gtest
      URL https://github.com/google/googletest/archive/release-1.11.0.tar.gz
      URL_HASH SHA256=b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5
      PREFIX ${CMAKE_CURRENT_BINARY_DIR}
      CMAKE_ARGS -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
      INSTALL_COMMAND ""
      BUILD_BYPRODUCTS ${GTEST_LIBRARIES} )
    # NOTE(review): source_dir / binary_dir are not used again in this file -
    # confirm whether the tests subdirectory relies on them.
    ExternalProject_Get_Property( gtest source_dir binary_dir )
  endif()
  add_subdirectory( tests )
endif( )

if( BUILD_CLIENTS_BENCH )
  add_subdirectory( bench )
endif( )
hipFFT-rocm-6.4.3/clients/bench/000077500000000000000000000000001501537340400163245ustar00rootroot00000000000000hipFFT-rocm-6.4.3/clients/bench/CMakeLists.txt000066400000000000000000000107141501537340400210670ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Build script for the hipfft-bench benchmark executable: compiles bench.cpp
# with the shared array validator, links it against hipFFT and the selected
# backend, and installs it together with a compatibility link under the old
# hipfft-rider name.

CMAKE_MINIMUM_REQUIRED(VERSION 3.16)

project( hipfft-clients-bench LANGUAGES CXX )

set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
set( hipfft_bench_includes bench.h ../../shared/array_validator.h )

add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )

target_compile_options( hipfft-bench PRIVATE ${WARNING_FLAGS} )

set_target_properties( hipfft-bench PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON )

# NOTE(review): the four bare `$` arguments look like generator expressions
# whose <...> part was lost in this copy of the file - restore them from the
# upstream source before relying on this statement.
target_include_directories( hipfft-bench PRIVATE $ $ $ $ )

# For the ROCm backend the HIP package is located (CONFIG mode on Windows,
# MODULE mode elsewhere) and hip::host / hip::device are linked; for the CUDA
# backend only the NVIDIA platform define and HIP include dirs are added.
if((NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
    if( WIN32 )
      find_package( HIP CONFIG REQUIRED )
    else()
      find_package( HIP MODULE REQUIRED )
    endif()
    target_link_libraries( hipfft-bench PRIVATE hip::host hip::device )
  else()
    target_compile_definitions( hipfft-bench PRIVATE __HIP_PLATFORM_NVIDIA__)
    target_include_directories( hipfft-bench PRIVATE ${HIP_INCLUDE_DIRS})
  endif()
endif()

if ( BUILD_WITH_LIB STREQUAL "CUDA" )
  # nvc++ takes -cuda at compile and link time; any other compiler on this
  # path gets the fixed sm_53 architecture flags.
  if( CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+$" )
    target_compile_options( hipfft-bench PRIVATE -cuda -Xptxas=-w)
    target_link_options( hipfft-bench PRIVATE -cuda)
  else()
    target_compile_options( hipfft-bench PRIVATE -arch sm_53 -gencode=arch=compute_53,code=sm_53 -Xptxas=-w)
  endif()
  target_link_libraries( hipfft-bench PRIVATE ${CUDA_LIBRARIES} )
else()
  # hipRAND is only searched for when requested and not already found.
  if( USE_HIPRAND AND NOT hiprand_FOUND )
    find_package( hiprand REQUIRED )
  endif()
  if( USE_HIPRAND )
    target_link_libraries( hipfft-bench PRIVATE hip::hiprand )
  endif()
endif()

target_link_libraries( hipfft-bench PRIVATE hip::hipfft )

set_target_properties( hipfft-bench PROPERTIES CXX_EXTENSIONS NO )

# This initial RUNTIME_OUTPUT_DIRECTORY is overwritten further down once
# BENCH_OUT_DIR has been computed.
set_target_properties( hipfft-bench PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )

# Pick the output directory relative to PROJECT_BINARY_DIR according to which
# enclosing script (top-level library build, clients-only build, or neither)
# set its scope variable.
if( HIPFFT_BUILD_SCOPE )
  set( BENCH_OUT_DIR "/../staging" )
elseif( HIPFFT_CLIENTS_BUILD_SCOPE )
  set( BENCH_OUT_DIR "/../bin" )
else()
  set( BENCH_OUT_DIR "/bin")
endif()
string( CONCAT BENCH_OUT_DIR "${PROJECT_BINARY_DIR}" ${BENCH_OUT_DIR} )

set_target_properties( hipfft-bench PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${BENCH_OUT_DIR} )

rocm_install(TARGETS hipfft-bench COMPONENT benchmarks)

# install compatibility for old name of bench program - symlink on
# unix, hardlink on windows (since privilege is required to create
# symlinks there)
if( WIN32 )
  set( BENCH_LINK_COMMAND create_hardlink )
  set( BENCH_NEW_NAME ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/hipfft-bench${CMAKE_EXECUTABLE_SUFFIX} )
  set( BENCH_OLD_NAME ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/hipfft-rider${CMAKE_EXECUTABLE_SUFFIX} )
else()
  set( BENCH_LINK_COMMAND create_symlink )
  set( BENCH_NEW_NAME hipfft-bench )
  set( BENCH_OLD_NAME ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/hipfft-rider )
endif()
# The link command and both paths are expanded into the string when this file
# is configured, so the install-time step uses the CMAKE_INSTALL_PREFIX value
# known at configure time.
install( CODE "execute_process( COMMAND \"${CMAKE_COMMAND}\" -E ${BENCH_LINK_COMMAND} \"${BENCH_NEW_NAME}\" \"${BENCH_OLD_NAME}\" )" )
hipFFT-rocm-6.4.3/clients/bench/bench.cpp000066400000000000000000000356521501537340400201220ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include #include #include "bench.h" #include "../../shared/CLI11.hpp" #include "../../shared/client_except.h" #include "../../shared/gpubuf.h" int main(int argc, char* argv[]) { // This helps with mixing output of both wide and narrow characters to the screen std::ios::sync_with_stdio(false); // Control output verbosity: int verbose{}; // hip Device number for running tests: int deviceId{}; // Number of performance trial samples int ntrial{}; // FFT parameters: hipfft_params params; // Token string to fully specify fft params. std::string token; // Declare the supported options. CLI::App app{"hipfft-bench command line options"}; // Declare the supported options. Some option pointers are declared to track passed opts. 
app.add_flag("-v, --version", "Print queryable version information from the rocfft library") // ->each([](const std::string&) { // char v[256]; // rocfft_get_version_string(v, 256); // std::cout << "version " << v << std::endl; // std::exit(EXIT_SUCCESS); // }) ; CLI::Option* opt_token = app.add_option("--token", token, "Token to read FFT params from")->default_val(""); // Group together options that conflict with --token auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { params.precision = fft_precision_double; }); non_token->excludes(opt_token); non_token ->add_option("-t, --transformType", params.transform_type, "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); non_token ->add_option( "--precision", params.precision, "Transform precision: single (default), double, half") ->excludes("--double"); CLI::Option* opt_not_in_place = non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)") ->each([&](const std::string&) { params.placement = fft_placement_notinplace; }); non_token ->add_option("--itype", params.itype, "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token ->add_option("--otype", params.otype, "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); CLI::Option* opt_length = non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3); non_token ->add_option("-b, --batchSize", params.nbatch, "If this value is greater than one, arrays will be used") ->default_val(1); CLI::Option* opt_istride = non_token->add_option("--istride", 
params.istride, "Input strides"); CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides"); non_token->add_option("--idist", params.idist, "Logical distance between input batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; }); non_token->add_option("--odist", params.odist, "Logical distance between output batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; }); CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset"); CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset"); app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0); app.add_option("-N, --ntrial", ntrial, "Trial size for the problem") ->default_val(1) ->each([&](const std::string& val) { std::cout << "Running profile with " << val << " samples\n"; }); // Default value is set in fft_params.h based on if device-side PRNG was enabled. app.add_option("-g, --inputGen", params.igen, "Input data generation:\n0) PRNG sequence (device)\n" "1) PRNG sequence (host)\n" "2) linearly-spaced sequence (device)\n" "3) linearly-spaced sequence (host)"); app.add_option("--isize", params.isize, "Logical size of input buffer"); app.add_option("--osize", params.osize, "Logical size of output buffer"); app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output"); // Parse args and catch any errors here try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } if(!token.empty()) { std::cout << "Reading fft params from token:\n" << token << std::endl; try { params.from_token(token); } catch(...) { std::cout << "Unable to parse token." 
<< std::endl; return EXIT_FAILURE; } } else { if(*opt_not_in_place) { std::cout << "out-of-place\n"; } else { std::cout << "in-place\n"; } if(*opt_length) { std::cout << "length:"; for(auto& i : params.length) std::cout << " " << i; std::cout << "\n"; } if(*opt_istride) { std::cout << "istride:"; for(auto& i : params.istride) std::cout << " " << i; std::cout << "\n"; } if(*opt_ostride) { std::cout << "ostride:"; for(auto& i : params.ostride) std::cout << " " << i; std::cout << "\n"; } if(*opt_ioffset) { std::cout << "ioffset:"; for(auto& i : params.ioffset) std::cout << " " << i; std::cout << "\n"; } if(*opt_ooffset) { std::cout << "ooffset:"; for(auto& i : params.ooffset) std::cout << " " << i; std::cout << "\n"; } } std::cout << std::flush; // Fixme: set the device id properly after the IDs are synced // bewteen hip runtime and rocm-smi. // HIP_V_THROW(hipSetDevice(deviceId), "set device failed!"); params.validate(); if(!params.valid(verbose)) { throw std::runtime_error("Invalid parameters, add --verbose=1 for detail"); } std::cout << "Token: " << params.token() << std::endl; if(verbose) { std::cout << params.str() << std::endl; std::cout << "Token: " << params.token() << std::endl; } // Check free and total available memory: size_t free = 0; size_t total = 0; if(hipMemGetInfo(&free, &total) != hipSuccess) throw std::runtime_error("hipMemGetInfo failed"); const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } size_t vram_footprint = 0; try { vram_footprint = params.vram_footprint(); } catch(ROCFFT_SKIP& e) { std::cout << "SKIPPED: " << e.msg << "\n"; return EXIT_SUCCESS; } if(!vram_fits_problem(vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } 
// Create plans: auto ret = params.create_plan(); if(ret != fft_status_success) throw std::runtime_error("Plan creation failed"); hipError_t hip_rt; // GPU input buffer: auto ibuffer_sizes = params.ibuffer_sizes(); std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { hip_rt = ibuffer[i].alloc(ibuffer_sizes[i]); if(hip_rt != hipSuccess) throw std::runtime_error("Creating input Buffer failed"); pibuffer[i] = ibuffer[i].data(); } // CPU-side input buffer std::vector ibuffer_cpu; auto is_host_gen = (params.igen == fft_input_generator_host || params.igen == fft_input_random_generator_host); #ifdef USE_HIPRAND if(!is_host_gen) { // Input data: params.compute_input(ibuffer); if(verbose > 1) { // Copy input to CPU ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost), "hipMemcpy failed"); } std::cout << "GPU input:\n"; params.print_ibuffer(ibuffer_cpu); } } #endif if(is_host_gen) { // Input data: ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize); params.compute_input(ibuffer_cpu); if(verbose > 1) { std::cout << "GPU input:\n"; params.print_ibuffer(ibuffer_cpu); } for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) { HIP_V_THROW(hipMemcpy(pibuffer[idx], ibuffer_cpu[idx].data(), ibuffer_cpu[idx].size(), hipMemcpyHostToDevice), "hipMemcpy failed"); } } // GPU output buffer: std::vector obuffer_data; std::vector* obuffer = &obuffer_data; if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { hip_rt = obuffer_data[i].alloc(obuffer_sizes[i]); if(hip_rt != hipSuccess) throw 
std::runtime_error("Creating output Buffer failed"); } } std::vector pobuffer(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } auto res = params.execute(pibuffer.data(), pobuffer.data()); if(res != fft_status_success) throw std::runtime_error("Execution failed"); // Run the transform several times and record the execution time: std::vector gpu_time(ntrial); hipEvent_t start, stop; hip_rt = hipEventCreate(&start); if(hip_rt != hipSuccess) throw std::runtime_error("hipEventCreate failed"); hip_rt = hipEventCreate(&stop); if(hip_rt != hipSuccess) throw std::runtime_error("hipEventCreate failed"); for(size_t itrial = 0; itrial < gpu_time.size(); ++itrial) { #ifdef USE_HIPRAND // Compute input on default device if(!is_host_gen) params.compute_input(ibuffer); #endif if(is_host_gen) { for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx) { HIP_V_THROW(hipMemcpy(pibuffer[idx], ibuffer_cpu[idx].data(), ibuffer_cpu[idx].size(), hipMemcpyHostToDevice), "hipMemcpy failed"); } } hip_rt = hipEventRecord(start); if(hip_rt != hipSuccess) throw std::runtime_error("hipEventRecord failed"); res = params.execute(pibuffer.data(), pobuffer.data()); hip_rt = hipEventRecord(stop); if(hip_rt != hipSuccess) throw std::runtime_error("hipEventRecord failed"); hip_rt = hipEventSynchronize(stop); if(hip_rt != hipSuccess) throw std::runtime_error("hipEventSynchronize failed"); if(res != fft_status_success) throw std::runtime_error("Execution failed"); float time; hip_rt = hipEventElapsedTime(&time, start, stop); if(hip_rt != hipSuccess) throw std::runtime_error("hipEventElapsedTime failed"); gpu_time[itrial] = time; if(verbose > 2) { auto output = allocate_host_buffer(params.precision, params.otype, params.osize); for(unsigned int idx = 0; idx < output.size(); ++idx) { hip_rt = hipMemcpy( output[idx].data(), pobuffer[idx], output[idx].size(), hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy 
failed"); } std::cout << "GPU output:\n"; params.print_obuffer(output); } } std::cout << "\nExecution gpu time:"; for(const auto& i : gpu_time) { std::cout << " " << i; } std::cout << " ms" << std::endl; } hipFFT-rocm-6.4.3/clients/bench/bench.h000066400000000000000000000054151501537340400175610ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef HIPFFT_BENCH_H #define HIPFFT_BENCH_H #include "../hipfft_params.h" #include "hipfft/hipfft.h" #include // This is used to either wrap a HIP function call, or to explicitly check a variable // for an error condition. If an error occurs, we throw. 
// Note: std::runtime_error does not take unicode strings as input, so only strings // supported inline void hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != hipSuccess) { std::stringstream tmp; tmp << "HIP_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw std::runtime_error(errorm); } } inline void lib_V_Throw(hipfftResult res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != HIPFFT_SUCCESS) { std::stringstream tmp; tmp << "LIB_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw std::runtime_error(errorm); } } #define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__) #define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__) #endif // HIPFFT_BENCH_H hipFFT-rocm-6.4.3/clients/cmake/000077500000000000000000000000001501537340400163255ustar00rootroot00000000000000hipFFT-rocm-6.4.3/clients/cmake/FindFFTW.cmake000066400000000000000000000105451501537340400207030ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# #if( FFTW_FIND_VERSION VERSION_LESS "3" ) # message( FFTW_FIND_VERION is ${FFTW_FIND_VERSION}) # message( FATAL_ERROR "FindFFTW can not configure versions less than FFTW 3.0.0" ) #endif( ) find_path(FFTW_INCLUDE_DIRS NAMES fftw3.h HINTS ${FFTW_ROOT}/include $ENV{FFTW_ROOT}/include PATHS /usr/include /usr/local/include ) mark_as_advanced( FFTW_INCLUDE_DIRS ) # message( STATUS "FFTW_FIND_COMPONENTS: ${FFTW_FIND_COMPONENTS}" ) # message( STATUS "FFTW_FIND_REQUIRED_FLOAT: ${FFTW_FIND_REQUIRED_FLOAT}" ) # message( STATUS "FFTW_FIND_REQUIRED_DOUBLE: ${FFTW_FIND_REQUIRED_DOUBLE}" ) set( FFTW_LIBRARIES "" ) if( FFTW_FIND_REQUIRED_FLOAT OR FFTW_FIND_REQUIRED_SINGLE ) find_library( FFTW_LIBRARIES_SINGLE NAMES fftw3f fftw3f-3 fftw3 fftw3-3 HINTS ${FFTW_ROOT}/lib $ENV{FFTW_ROOT}/lib PATHS /usr/lib /usr/local/lib PATH_SUFFIXES x86_64-linux-gnu DOC "FFTW dynamic library single" ) mark_as_advanced( FFTW_LIBRARIES_SINGLE ) list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_SINGLE} ) # Look for omp (preferred) or thread libraries. These are not a # hard requirement, but are nice to have to make FFTW run faster. find_library( FFTWF_OMP_LIBRARY fftw3f_omp ) find_library( FFTWF_THREADS_LIBRARY fftw3f_threads ) if( FFTWF_OMP_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTWF_OMP_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) elseif( FFTWF_THREADS_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTWF_THREADS_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) endif() endif( ) if( FFTW_FIND_REQUIRED_DOUBLE ) find_library( FFTW_LIBRARIES_DOUBLE NAMES fftw3 HINTS ${FFTW_ROOT}/lib $ENV{FFTW_ROOT}/lib PATHS /usr/lib /usr/local/lib PATH_SUFFIXES x86_64-linux-gnu DOC "FFTW dynamic library double" ) mark_as_advanced( FFTW_LIBRARIES_DOUBLE ) list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} ) # Look for omp (preferred) or thread libraries. These are not a # hard requirement, but are nice to have to make FFTW run faster. 
find_library( FFTW_OMP_LIBRARY fftw3_omp ) find_library( FFTW_THREADS_LIBRARY fftw3_threads ) if( FFTW_OMP_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTW_OMP_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) elseif( FFTW_THREADS_LIBRARY ) list( APPEND FFTW_LIBRARIES ${FFTW_THREADS_LIBRARY} ) set( FFTW_MULTITHREAD TRUE ) endif() endif( ) include( FindPackageHandleStandardArgs ) FIND_PACKAGE_HANDLE_STANDARD_ARGS( FFTW REQUIRED_VARS FFTW_INCLUDE_DIRS FFTW_LIBRARIES ) # assume the threads feature is always enabled on Windows, since it's # not a separate library there if( FFTW_FOUND AND WIN32 ) set( FFTW_MULTITHREAD TRUE ) endif() if( NOT FFTW_FOUND ) message( STATUS "FindFFTW could not find all of the following fftw libraries" ) message( STATUS "${FFTW_FIND_COMPONENTS}" ) else( ) message(STATUS "FindFFTW configured variables:" ) message(STATUS "FFTW_INCLUDE_DIRS: ${FFTW_INCLUDE_DIRS}" ) message(STATUS "FFTW_LIBRARIES: ${FFTW_LIBRARIES}" ) endif() hipFFT-rocm-6.4.3/clients/cmake/build-options.cmake000066400000000000000000000036751501537340400221320ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ############################################################################# # This file is intended to be used in two ways; independently in a stand alone PROJECT # and as part of a superbuild. If the file is included in a stand alone project, the # variables are not expected to be preset, and this will produce options() in the GUI # for the user to examine. If this file is included in a superbuild, the options will be # presented in the superbuild GUI, but then passed into the ExternalProject as -D # parameters, which would already define them. if( NOT BUILD_CLIENTS_TESTS ) option( BUILD_CLIENTS_TESTS "Build hipFFT unit tests" OFF ) endif( ) if( NOT BUILD_CLIENTS_SAMPLES ) option( BUILD_CLIENTS_SAMPLES "Build hipFFT samples" OFF ) endif( ) hipFFT-rocm-6.4.3/clients/hipfft_params.h000066400000000000000000001306051501537340400202460ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef HIPFFT_PARAMS_H #define HIPFFT_PARAMS_H #include #include #include #include #include "../shared/client_except.h" #include "../shared/concurrency.h" #include "../shared/fft_params.h" #include "../shared/hipfft_brick.h" #include "hipfft/hipfft.h" #include "hipfft/hipfftXt.h" #ifdef HIPFFT_MPI_ENABLE #include "hipfft/hipfftMp.h" #include #endif inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val) { switch(val) { case HIPFFT_SUCCESS: return fft_status_success; case HIPFFT_INVALID_PLAN: case HIPFFT_ALLOC_FAILED: return fft_status_failure; case HIPFFT_INVALID_TYPE: case HIPFFT_INVALID_VALUE: case HIPFFT_INVALID_SIZE: case HIPFFT_INCOMPLETE_PARAMETER_LIST: case HIPFFT_INVALID_DEVICE: case HIPFFT_NOT_IMPLEMENTED: case HIPFFT_NOT_SUPPORTED: return fft_status_invalid_arg_value; case HIPFFT_INTERNAL_ERROR: case HIPFFT_EXEC_FAILED: case HIPFFT_SETUP_FAILED: case HIPFFT_UNALIGNED_DATA: case HIPFFT_PARSE_ERROR: return fft_status_failure; case HIPFFT_NO_WORKSPACE: return fft_status_invalid_work_buffer; default: return fft_status_failure; } } inline std::string hipfftResult_string(const hipfftResult_t val) { switch(val) { case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS (0)"; case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN (1)"; case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED (2)"; case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE (3)"; case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE (4)"; case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR (5)"; case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED (6)"; case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED (7)"; case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE (8)"; case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA (9)"; case HIPFFT_INCOMPLETE_PARAMETER_LIST: return "HIPFFT_INCOMPLETE_PARAMETER_LIST (10)"; case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE (11)"; case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR (12)"; case HIPFFT_NO_WORKSPACE: return 
"HIPFFT_NO_WORKSPACE (13)"; case HIPFFT_NOT_IMPLEMENTED: return "HIPFFT_NOT_IMPLEMENTED (14)"; case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED (16)"; default: return "invalid hipfftResult"; } } class hipfft_params : public fft_params { public: // plan handles are pointers for rocFFT backend, and ints for cuFFT #ifdef __HIP_PLATFORM_AMD__ static constexpr hipfftHandle INVALID_PLAN_HANDLE = nullptr; #else static constexpr hipfftHandle INVALID_PLAN_HANDLE = -1; #endif hipfftHandle plan = INVALID_PLAN_HANDLE; // keep track of token to check when attempting to create new plan std::string current_token; // hipFFT has two ways to specify transform type - the hipfftType // enum, and separate hipDataType enums for input/output. // hipfftType has no way to express an fp16 transform, so // hipfft_transform_type will not be set in that case. std::optional hipfft_transform_type; hipDataType inputType = HIP_C_32F; hipDataType outputType = HIP_C_32F; int direction; std::vector int_length; std::vector int_inembed; std::vector int_onembed; std::vector ll_length; std::vector ll_inembed; std::vector ll_onembed; struct hipLibXtDesc_deleter { void operator()(hipLibXtDesc* d) { hipfftXtFree(d); } }; // allocated memory on devices for multi-GPU transforms - inplace // just uses xt_output std::unique_ptr xt_input; std::unique_ptr xt_output; // rocFFT brick decomposition for Xt memory - multi-GPU tests will // confirm that rocFFT's decomposition matches cuFFT's std::vector xt_inBricks; std::vector xt_outBricks; // backend library can write N worksize values for N GPUs, so // allocate a vector for that if necessary std::vector xt_worksize; // pointer we pass to the backend library. By default point to the // single-GPU workbuffer size. 
size_t* workbuffersize_ptr; hipfft_params() { workbuffersize_ptr = &workbuffersize; } hipfft_params(const fft_params& p) : fft_params(p) { workbuffersize_ptr = &workbuffersize; } ~hipfft_params() { free(); }; void free() { if(plan != INVALID_PLAN_HANDLE) { hipfftDestroy(plan); plan = INVALID_PLAN_HANDLE; } xt_input.reset(); xt_output.reset(); } size_t vram_footprint() override { size_t val = fft_params::vram_footprint(); // auto-allocated plans fail here if not enough VRAM, skip these tests try { if(create_plan() != fft_status_success) { throw std::runtime_error("Plan creation or struct setup failed"); } } catch(fft_params::work_buffer_alloc_failure& e) { val += workbuffersize; std::stringstream msg; msg << "Plan work buffer size (" << val << " bytes raw data) too large for device"; throw ROCFFT_SKIP{msg.str()}; } val += workbuffersize; return val; } fft_status setup_structs() { // set direction switch(transform_type) { case fft_transform_type_complex_forward: case fft_transform_type_real_forward: direction = HIPFFT_FORWARD; break; case fft_transform_type_complex_inverse: case fft_transform_type_real_inverse: direction = HIPFFT_BACKWARD; break; } // set i/o types and transform type switch(transform_type) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: { switch(precision) { case fft_precision_half: inputType = HIP_C_16F; outputType = HIP_C_16F; hipfft_transform_type.reset(); break; case fft_precision_single: inputType = HIP_C_32F; outputType = HIP_C_32F; hipfft_transform_type = HIPFFT_C2C; break; case fft_precision_double: inputType = HIP_C_64F; outputType = HIP_C_64F; hipfft_transform_type = HIPFFT_Z2Z; break; } break; } case fft_transform_type_real_forward: { switch(precision) { case fft_precision_half: inputType = HIP_R_16F; outputType = HIP_C_16F; hipfft_transform_type.reset(); break; case fft_precision_single: inputType = HIP_R_32F; outputType = HIP_C_32F; hipfft_transform_type = HIPFFT_R2C; break; case fft_precision_double: 
inputType = HIP_R_64F; outputType = HIP_C_64F; hipfft_transform_type = HIPFFT_D2Z; break; } break; } case fft_transform_type_real_inverse: { switch(precision) { case fft_precision_half: inputType = HIP_C_16F; outputType = HIP_R_16F; hipfft_transform_type.reset(); break; case fft_precision_single: inputType = HIP_C_32F; outputType = HIP_R_32F; hipfft_transform_type = HIPFFT_C2R; break; case fft_precision_double: inputType = HIP_C_64F; outputType = HIP_R_64F; hipfft_transform_type = HIPFFT_Z2D; break; } break; } default: throw std::runtime_error("Invalid transform type"); } int_length.resize(dim()); int_inembed.resize(dim()); int_onembed.resize(dim()); ll_length.resize(dim()); ll_inembed.resize(dim()); ll_onembed.resize(dim()); switch(dim()) { case 3: ll_inembed[2] = istride[1] / istride[2]; ll_onembed[2] = ostride[1] / ostride[2]; [[fallthrough]]; case 2: ll_inembed[1] = istride[0] / istride[1]; ll_onembed[1] = ostride[0] / ostride[1]; [[fallthrough]]; case 1: ll_inembed[0] = istride[dim() - 1]; ll_onembed[0] = ostride[dim() - 1]; break; default: throw std::runtime_error("Invalid dimension"); } for(size_t i = 0; i < dim(); ++i) { ll_length[i] = length[i]; int_length[i] = length[i]; int_inembed[i] = ll_inembed[i]; int_onembed[i] = ll_onembed[i]; } hipfftResult ret = HIPFFT_SUCCESS; return fft_status_from_hipfftparams(ret); } fft_status create_plan() override { // check if we need to make a new plan if(current_token == token()) { return fft_status_success; } else { if(plan != INVALID_PLAN_HANDLE) { hipfftDestroy(plan); plan = INVALID_PLAN_HANDLE; } } auto fft_ret = setup_structs(); if(fft_ret != fft_status_success) { return fft_ret; } hipfftResult ret{HIPFFT_INTERNAL_ERROR}; switch(get_create_type()) { case PLAN_Nd: { ret = create_plan_Nd(); break; } case PLAN_MANY: { ret = create_plan_many(); break; } case CREATE_MAKE_PLAN_Nd: { ret = create_make_plan_Nd(); break; } case CREATE_MAKE_PLAN_MANY: { ret = create_make_plan_many(); break; } case CREATE_MAKE_PLAN_MANY64: { 
ret = create_make_plan_many64(); break; } case CREATE_XT_MAKE_PLAN_MANY: { ret = create_xt_make_plan_many(); break; } default: { throw std::runtime_error("no valid plan creation type"); } } // hipFFT can fail plan creation due to allocation failure - // tests are expecting a specific exception in that case, // because the test was unable to run. Doesn't mean the test // case failed. if(ret == HIPFFT_ALLOC_FAILED) throw fft_params::work_buffer_alloc_failure( "plan create failed due to allocation failure"); // store token to check if plan was already made current_token = token(); return fft_status_from_hipfftparams(ret); } void validate_fields() const override { validate_brick_volume(); // multi-process only works with batch-1 FFTs, as hipFFT has // no place in the API to communicate batch indexes for // bricks if(mp_lib != fft_mp_lib_none && nbatch > 1) throw std::runtime_error("multi-process FFTs require batch-1"); // if user provided decomposition if(!ifields.empty() || !ofields.empty()) { // then library-decomposed multi-GPU must not also be requested if(multiGPU > 1) throw std::runtime_error( "cannot request both library-decomposed GPU and user decomposition"); // count bricks per rank std::map rank_ibrick_count; std::map rank_obrick_count; for(const auto& b : ifields.front().bricks) rank_ibrick_count[b.rank]++; for(const auto& b : ofields.front().bricks) rank_obrick_count[b.rank]++; // make sure there's only one input/output brick per rank auto count_is_one = [](const std::pair& entry) { return entry.second == 1; }; if(!std::all_of(rank_ibrick_count.begin(), rank_ibrick_count.end(), count_is_one) || !std::all_of(rank_obrick_count.begin(), rank_obrick_count.end(), count_is_one)) throw std::runtime_error("multiple bricks per rank are not supported"); // also ensure that each input brick maps to an output on same rank if(rank_ibrick_count != rank_obrick_count) throw std::runtime_error("input and output bricks do not match up"); } } fft_status set_callbacks(void* 
load_cb_host, void* load_cb_data, void* store_cb_host, void* store_cb_data) override { if(run_callbacks) { if(!hipfft_transform_type) throw std::runtime_error("callbacks require a valid hipfftType"); hipfftResult ret{HIPFFT_EXEC_FAILED}; switch(*hipfft_transform_type) { case HIPFFT_R2C: ret = hipfftXtSetCallback(plan, &load_cb_host, HIPFFT_CB_LD_REAL, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_D2Z: ret = hipfftXtSetCallback( plan, &load_cb_host, HIPFFT_CB_LD_REAL_DOUBLE, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX_DOUBLE, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_C2R: ret = hipfftXtSetCallback(plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback(plan, &store_cb_host, HIPFFT_CB_ST_REAL, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_Z2D: ret = hipfftXtSetCallback( plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX_DOUBLE, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_REAL_DOUBLE, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_C2C: ret = hipfftXtSetCallback(plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_Z2Z: ret = hipfftXtSetCallback( 
plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX_DOUBLE, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX_DOUBLE, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; default: throw std::runtime_error("Invalid execution type"); } } return fft_status_success; } virtual fft_status execute(void** in, void** out) override { return execute(in[0], out[0]); }; fft_status execute(void* ibuffer, void* obuffer) { hipfftResult ret{HIPFFT_EXEC_FAILED}; // if we're doing multi-GPU, we need to use ExecDescriptor // methods to execute. if(multiGPU > 1) { // rotate between generic ExecDescriptor and specific // ExecDescriptorX2Y functions by hashing token (for // stability across reruns of test cases) // // the specific functions are only for the main transform // types expressible through the hipfftType enum bool generic_ExecDescriptor = !hipfft_transform_type || std::hash()(token()) % 2; if(generic_ExecDescriptor) { ret = hipfftXtExecDescriptor(plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get(), direction); } else { switch(*hipfft_transform_type) { case HIPFFT_R2C: ret = hipfftXtExecDescriptorR2C( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_C2R: ret = hipfftXtExecDescriptorC2R( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_C2C: ret = hipfftXtExecDescriptorC2C( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get(), direction); break; case HIPFFT_D2Z: ret = hipfftXtExecDescriptorD2Z( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_Z2D: ret = hipfftXtExecDescriptorZ2D( plan, placement == fft_placement_inplace ? 
xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_Z2Z: ret = hipfftXtExecDescriptorZ2Z( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get(), direction); } } return fft_status_from_hipfftparams(ret); } // otherwise, we have two ways to execute in hipFFT - // hipfftExecFOO and hipfftXtExec // Transforms that aren't supported by the hipfftType enum // require using the Xt method, but otherwise we hash the // token to decide how to execute this FFT. we want test // cases to rotate between different execution APIs, but we also // need the choice of API to be stable across reruns of the // same test cases. if(!hipfft_transform_type || std::hash()(token()) % 2) { ret = hipfftXtExec(plan, ibuffer, obuffer, direction); } else { try { switch(*hipfft_transform_type) { case HIPFFT_R2C: ret = hipfftExecR2C( plan, (hipfftReal*)ibuffer, (hipfftComplex*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_D2Z: ret = hipfftExecD2Z(plan, (hipfftDoubleReal*)ibuffer, (hipfftDoubleComplex*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_C2R: ret = hipfftExecC2R( plan, (hipfftComplex*)ibuffer, (hipfftReal*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_Z2D: ret = hipfftExecZ2D(plan, (hipfftDoubleComplex*)ibuffer, (hipfftDoubleReal*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_C2C: ret = hipfftExecC2C( plan, (hipfftComplex*)ibuffer, (hipfftComplex*)(placement == fft_placement_inplace ? ibuffer : obuffer), direction); break; case HIPFFT_Z2Z: ret = hipfftExecZ2Z(plan, (hipfftDoubleComplex*)ibuffer, (hipfftDoubleComplex*)(placement == fft_placement_inplace ? ibuffer : obuffer), direction); break; default: throw std::runtime_error("Invalid execution type"); } } catch(const std::exception& e) { std::cerr << e.what() << std::endl; } catch(...) 
{ std::cerr << "unknown exception in execute(void* ibuffer, void* obuffer)" << std::endl; } } return fft_status_from_hipfftparams(ret); } bool is_contiguous() const { // compute contiguous stride, dist and check that the actual // strides/dists match std::vector contiguous_istride = compute_stride(ilength(), {}, placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward); std::vector contiguous_ostride = compute_stride(olength(), {}, placement == fft_placement_inplace && transform_type == fft_transform_type_real_inverse); if(istride != contiguous_istride || ostride != contiguous_ostride) return false; return compute_idist() == idist && compute_odist() == odist; } // stride is row-major like everything else in fft_params. brick // indexes/strides are col-major because those would normally be // passed to rocFFT directly static bool xt_desc_matches_brick(const hostbuf& field, const std::vector& stride, size_t dist, const hipXtDesc* desc, const std::vector& bricks, size_t elem_size, const char* dir) { // construct field stride that includes batch distance too, since // brick coordinates include it auto field_stride_cm = stride; std::reverse(field_stride_cm.begin(), field_stride_cm.end()); field_stride_cm.push_back(dist); std::atomic compare_err = false; std::atomic runtime_err = false; std::vector brick_hosts; brick_hosts.resize(bricks.size()); #ifdef _OPENMP #pragma omp parallel for num_threads(rocfft_concurrency()) #endif for(size_t i = 0; i < bricks.size(); ++i) { // copy the ith brick back to host memory rocfft_scoped_device device(desc->GPUs[i]); hostbuf& brick_host = brick_hosts[i]; brick_host.alloc(desc->size[i]); if(hipMemcpy(brick_host.data(), desc->data[i], brick_host.size(), hipMemcpyDeviceToHost) != hipSuccess) { runtime_err = true; continue; } // convert to row-major auto brick_length_rm = bricks[i].length(); std::reverse(brick_length_rm.begin(), brick_length_rm.end()); // start at brick origin auto brick_idx_rm = 
brick_length_rm; std::fill(brick_idx_rm.begin(), brick_idx_rm.end(), 0); do { auto brick_idx_cm = brick_idx_rm; std::reverse(brick_idx_cm.begin(), brick_idx_cm.end()); auto field_offset = bricks[i].field_offset(brick_idx_cm, field_stride_cm); auto brick_offset = bricks[i].brick_offset(brick_idx_cm); if(memcmp(brick_host.data_offset(brick_offset * elem_size), field.data_offset(field_offset * elem_size), elem_size) != 0) { compare_err = true; break; } } while(increment_rowmajor(brick_idx_rm, brick_length_rm)); } if(runtime_err) throw std::runtime_error("failed to memcpy brick back to host"); return !compare_err; } // call the hipFFT APIs to distribute data to multiple GPUs void multi_gpu_prepare(std::vector& ibuffer, std::vector& pibuffer, std::vector& pobuffer) override { if(multiGPU <= 1) return; // input data is on the device - copy it back to the host so // hipfftXtMemcpy can deal with it hostbuf input_host; input_host.alloc(ibuffer.front().size()); if(hipMemcpy(input_host.data(), ibuffer.front().data(), ibuffer.front().size(), hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("copy back to host failed"); // allocate data on the multiple GPUs if(placement == fft_placement_inplace) { hipLibXtDesc* xt_tmp = nullptr; if(hipfftXtMalloc(plan, &xt_tmp, HIPFFT_XT_FORMAT_INPLACE) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed"); xt_output.reset(xt_tmp); xt_tmp = nullptr; if(hipfftXtMemcpy(plan, xt_output.get(), input_host.data(), HIPFFT_COPY_HOST_TO_DEVICE) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed"); pibuffer.clear(); std::copy_n(xt_output->descriptor->data, xt_output->descriptor->nGPUs, std::back_inserter(pibuffer)); pobuffer.clear(); } else { hipLibXtDesc* xt_tmp = nullptr; if(hipfftXtMalloc(plan, &xt_tmp, HIPFFT_XT_FORMAT_INPUT) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed"); xt_input.reset(xt_tmp); xt_tmp = nullptr; if(hipfftXtMemcpy(plan, xt_input.get(), input_host.data(), 
HIPFFT_COPY_HOST_TO_DEVICE) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed"); if(hipfftXtMalloc(plan, &xt_tmp, HIPFFT_XT_FORMAT_OUTPUT) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed"); xt_output.reset(xt_tmp); xt_tmp = nullptr; pibuffer.clear(); std::copy_n(xt_input->descriptor->data, xt_input->descriptor->nGPUs, std::back_inserter(pibuffer)); pobuffer.clear(); std::copy_n(xt_output->descriptor->data, xt_output->descriptor->nGPUs, std::back_inserter(pobuffer)); } // create bricks for this transform so we can confirm data layout hipLibXtDesc* compare_desc = placement == fft_placement_inplace ? xt_output.get() : xt_input.get(); xt_inBricks.resize(compare_desc->descriptor->nGPUs); xt_outBricks.resize(compare_desc->descriptor->nGPUs); set_io_bricks(ilength_cm(), olength_cm(), nbatch, xt_inBricks, xt_outBricks); // check cufftXtMemcpy versus hipfft's implementation if(!xt_desc_matches_brick(input_host, istride, idist, compare_desc->descriptor, xt_inBricks, var_size(precision, itype), "input")) throw std::runtime_error("Xt input does not match"); } // call the hipFFT APIs to gather the data back from the multiple GPUs virtual void multi_gpu_finalize(std::vector& obuffer, std::vector& pobuffer) override { if(multiGPU <= 1) return; // allocate a host buffer for hipFFTXtMemcpy's sake hostbuf output_host; output_host.alloc(obuffer.front().size()); if(hipfftXtMemcpy(plan, output_host.data(), xt_output.get(), HIPFFT_COPY_DEVICE_TO_HOST) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed"); // check cufftXtMemcpy versus hipfft's implementation if(placement == fft_placement_notinplace) { if(!xt_desc_matches_brick(output_host, ostride, odist, xt_output->descriptor, xt_outBricks, var_size(precision, otype), "output")) throw std::runtime_error("Xt output does not match"); } // copy final result back to device for comparison if(hipMemcpy(obuffer.front().data(), output_host.data(), obuffer.front().size(), hipMemcpyHostToDevice) 
!= hipSuccess) throw std::runtime_error("finalizing hipMemcpy failed"); pobuffer.clear(); pobuffer.push_back(obuffer.front().data()); } private: // hipFFT provides multiple ways to create FFT plans: // - hipfftPlan1d/2d/3d (combined allocate + init for specific dim) // - hipfftPlanMany (combined allocate + init with dim as param) // - hipfftCreate + hipfftMakePlan1d/2d/3d (separate alloc + init for specific dim) // - hipfftCreate + hipfftMakePlanMany (separate alloc + init with dim as param) // - hipfftCreate + hipfftMakePlanMany64 (separate alloc + init with dim as param, 64-bit) // - hipfftCreate + hipfftXtMakePlanMany (separate alloc + init with separate i/o/exec types) // // Rotate through the choices for better test coverage. enum PlanCreateAPI { PLAN_Nd, PLAN_MANY, CREATE_MAKE_PLAN_Nd, CREATE_MAKE_PLAN_MANY, CREATE_MAKE_PLAN_MANY64, CREATE_XT_MAKE_PLAN_MANY, }; // return true if we need to use hipFFT APIs that separate plan // allocation and plan init bool need_separate_create_make() const { // scale factor and multi-GPU need API calls between create + // init if(scale_factor != 1.0 || multiGPU > 1 || mp_lib != fft_mp_lib_none) return true; return false; } // Not all plan options work with all creation types. Return a // suitable plan creation type for the current FFT parameters. int get_create_type() { bool contiguous = is_contiguous(); bool batched = nbatch > 1; std::vector allowed_apis; // half-precision requires XtMakePlanMany if(precision == fft_precision_half) { allowed_apis.push_back(CREATE_XT_MAKE_PLAN_MANY); } else { // separate alloc + init "Many" APIs are always allowed allowed_apis.push_back(CREATE_MAKE_PLAN_MANY); allowed_apis.push_back(CREATE_MAKE_PLAN_MANY64); allowed_apis.push_back(CREATE_XT_MAKE_PLAN_MANY); if(!need_separate_create_make()) allowed_apis.push_back(PLAN_MANY); // non-many APIs are only allowed if FFT is contiguous, and // only the 1D API allows for batched FFTs. 
if(contiguous && (!batched || dim() == 1)) { if(!need_separate_create_make()) allowed_apis.push_back(PLAN_Nd); allowed_apis.push_back(CREATE_MAKE_PLAN_Nd); } } // hash the token to decide how to create this FFT. we want // test cases to rotate between different create APIs, but we // also need the choice of API to be stable across reruns of // the same test cases. return allowed_apis[std::hash()(token()) % allowed_apis.size()]; } // call hipfftPlan* functions hipfftResult_t create_plan_Nd() { auto ret = HIPFFT_INVALID_PLAN; switch(dim()) { case 1: ret = hipfftPlan1d(&plan, int_length[0], *hipfft_transform_type, nbatch); break; case 2: ret = hipfftPlan2d(&plan, int_length[0], int_length[1], *hipfft_transform_type); break; case 3: ret = hipfftPlan3d( &plan, int_length[0], int_length[1], int_length[2], *hipfft_transform_type); break; default: throw std::runtime_error("invalid dim"); } return ret; } hipfftResult_t create_plan_many() { auto ret = hipfftPlanMany(&plan, dim(), int_length.data(), int_inembed.data(), istride.back(), idist, int_onembed.data(), ostride.back(), odist, *hipfft_transform_type, nbatch); return ret; } // call hipfftCreate + hipfftMake* functions, inserting calls to // relevant pre-Make APIs (scale factor, XtSetGPUs) hipfftResult_t create_with_pre_make() { auto ret = hipfftCreate(&plan); if(ret != HIPFFT_SUCCESS) return ret; if(scale_factor != 1.0) { ret = hipfftExtPlanScaleFactor(plan, scale_factor); if(ret != HIPFFT_SUCCESS) return ret; } if(multiGPU > 1) { int deviceCount = 0; (void)hipGetDeviceCount(&deviceCount); // ensure that users request less than or equal to the total number of devices if(static_cast(multiGPU) > deviceCount) throw std::runtime_error("not enough devices for requested multi-gpu computation!"); std::vector GPUs(multiGPU); std::iota(GPUs.begin(), GPUs.end(), 0); ret = hipfftXtSetGPUs(plan, static_cast(multiGPU), GPUs.data()); xt_worksize.resize(GPUs.size()); workbuffersize_ptr = xt_worksize.data(); } if(mp_lib == 
fft_mp_lib_mpi) { #ifdef HIPFFT_MPI_ENABLE ret = hipfftMpAttachComm(plan, HIPFFT_COMM_MPI, mp_comm); if(ret != HIPFFT_SUCCESS) return ret; int mpi_rank = 0; MPI_Comm_rank(*static_cast(mp_comm), &mpi_rank); const auto& in_bricks = ifields.front().bricks; const auto& out_bricks = ofields.front().bricks; // find the input/output brick for this rank auto curr_rank_brick = [mpi_rank](const fft_brick& b) { return b.rank == mpi_rank; }; auto in_brick = std::find_if(in_bricks.begin(), in_bricks.end(), curr_rank_brick); auto out_brick = std::find_if(out_bricks.begin(), out_bricks.end(), curr_rank_brick); if(in_brick != in_bricks.end() && out_brick != out_bricks.end()) { std::vector input_lower; std::vector input_upper; std::vector output_lower; std::vector output_upper; std::vector input_stride; std::vector output_stride; // convert brick info to long long int for hipFFT auto convert_intvec = [](const std::vector& in, std::vector& out) { // start with index 1 because hipFFT only wants to be // told about FFT dimensions, not batch dimension for(size_t i = 1; i < in.size(); ++i) out.push_back(static_cast(in[i])); }; convert_intvec(in_brick->lower, input_lower); convert_intvec(in_brick->upper, input_upper); convert_intvec(out_brick->lower, output_lower); convert_intvec(out_brick->upper, output_upper); convert_intvec(in_brick->stride, input_stride); convert_intvec(out_brick->stride, output_stride); ret = hipfftXtSetDistribution(plan, static_cast(dim()), input_lower.data(), input_upper.data(), output_lower.data(), output_upper.data(), input_stride.data(), output_stride.data()); } #else throw std::runtime_error("MPI is not enabled"); #endif } return ret; } hipfftResult_t create_make_plan_Nd() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; switch(dim()) { case 1: return hipfftMakePlan1d( plan, int_length[0], *hipfft_transform_type, nbatch, workbuffersize_ptr); case 2: return hipfftMakePlan2d( plan, int_length[0], int_length[1], *hipfft_transform_type, 
workbuffersize_ptr); case 3: return hipfftMakePlan3d(plan, int_length[0], int_length[1], int_length[2], *hipfft_transform_type, workbuffersize_ptr); default: throw std::runtime_error("invalid dim"); } } hipfftResult_t create_make_plan_many() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; return hipfftMakePlanMany(plan, dim(), int_length.data(), int_inembed.data(), istride.back(), idist, int_onembed.data(), ostride.back(), odist, *hipfft_transform_type, nbatch, workbuffersize_ptr); } hipfftResult_t create_make_plan_many64() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; return hipfftMakePlanMany64(plan, dim(), ll_length.data(), ll_inembed.data(), istride.back(), idist, ll_onembed.data(), ostride.back(), odist, *hipfft_transform_type, nbatch, workbuffersize_ptr); } hipfftResult_t create_xt_make_plan_many() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; // execution type is always complex, matching the precision // of the transform // Initializing as double by default hipDataType executionType = HIP_C_64F; switch(precision) { case fft_precision_half: executionType = HIP_C_16F; break; case fft_precision_single: executionType = HIP_C_32F; break; case fft_precision_double: executionType = HIP_C_64F; break; } return hipfftXtMakePlanMany(plan, dim(), ll_length.data(), ll_inembed.data(), istride.back(), idist, inputType, ll_onembed.data(), ostride.back(), odist, outputType, nbatch, workbuffersize_ptr, executionType); } }; #endif hipFFT-rocm-6.4.3/clients/samples/000077500000000000000000000000001501537340400167115ustar00rootroot00000000000000hipFFT-rocm-6.4.3/clients/samples/CMakeLists.txt000066400000000000000000000112421501537340400214510ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Build script for the hipFFT sample programs: one executable per entry in
# sample_list, each built from <name>.cpp in this directory.
CMAKE_MINIMUM_REQUIRED( VERSION 3.16 )

project( hipfft-clients-samples-rocfft LANGUAGES CXX )

# We use C++17 features, this will add compile option: -std=c++17
set( CMAKE_CXX_STANDARD 17 )

# Only look for an installed hipfft package when no hipfft target already
# exists in this configure run.
if( NOT TARGET hipfft )
  find_package( hipfft REQUIRED CONFIG PATHS )
endif( )

set( sample_list hipfft_1d_z2z hipfft_1d_d2z hipfft_2d_z2z hipfft_2d_d2z hipfft_3d_z2z hipfft_3d_d2z hipfft_planmany_2d_z2z hipfft_planmany_2d_r2c hipfft_multigpu_2d_z2z hipfft_setworkarea )

# callback sample has its own HIP code, so it needs to be built with hipcc or clang++
if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
  # on cuFFT backend, use of callbacks requires linking against the
  # static cuFFT library
  if( NOT (BUILD_WITH_LIB STREQUAL "CUDA") OR NOT BUILD_SHARED_LIBS )
    list( APPEND sample_list hipfft_callback )
  else()
    message( STATUS "hipfft_callback sample disabled on non-static CUDA build" )
  endif()
else()
  message( STATUS "hipfft_callback sample disabled, requires hipcc or Clang++ build" )
endif()

# Per-sample target setup.
foreach( sample ${sample_list} )
  add_executable( ${sample} ${sample}.cpp )
  set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON )
  target_link_libraries( ${sample} PRIVATE hip::hipfft )
  target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )

  # When the compiler is not hipcc, HIP has to be located explicitly.
  if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
    if( WIN32 )
      find_package( HIP CONFIG REQUIRED )
    else()
      find_package( HIP MODULE REQUIRED )
    endif()
    if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
      target_link_libraries( ${sample} PRIVATE hip::host hip::device )
    else()
      target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVIDIA__)
      target_include_directories( ${sample} PRIVATE ${HIP_INCLUDE_DIRS})
    endif()
  endif()

  # Backend-specific flags: CUDA builds get nvc++ or fixed sm_53 options;
  # other builds optionally link hipRAND when USE_HIPRAND is set.
  if ( BUILD_WITH_LIB STREQUAL "CUDA" )
    if( CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+$" )
      target_compile_options( ${sample} PRIVATE -cuda -Xptxas=-w)
      target_link_options( ${sample} PRIVATE -cuda)
    else()
      target_compile_options( ${sample} PRIVATE -arch sm_53 -gencode=arch=compute_53,code=sm_53 -Xptxas=-w)
    endif()
    target_link_libraries( ${sample} PRIVATE ${CUDA_LIBRARIES} )
  else()
    if( USE_HIPRAND AND NOT hiprand_FOUND )
      find_package( hiprand REQUIRED )
    endif()
    if ( USE_HIPRAND )
      target_link_libraries( ${sample} PRIVATE hip::hiprand )
    endif()
  endif()

  # NOTE(review): the three bare "$" entries below look like generator
  # expressions whose <...> part is missing from this text - restore them
  # from the upstream file before using this script.
  target_include_directories( ${sample} PRIVATE $ $ $ ${HIP_ROOT_DIR} )

  set_target_properties( ${sample} PROPERTIES CXX_EXTENSIONS NO )

  # Choose the output directory relative to PROJECT_BINARY_DIR according
  # to which build-scope variable is set.
  if( HIPFFT_BUILD_SCOPE )
    set( SAMPLES_OUT_DIR "/../staging" )
  elseif( HIPFFT_CLIENTS_BUILD_SCOPE )
    set( SAMPLES_OUT_DIR "/../bin" )
  else()
    set( SAMPLES_OUT_DIR "/bin" )
  endif()
  string( CONCAT SAMPLES_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_OUT_DIR} )
  set_target_properties( ${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_OUT_DIR} )
endforeach()

# cuFFT callback code must be compiled with -dc to enable relocatable
# device code
if( BUILD_WITH_LIB STREQUAL "CUDA" AND hipfft_callback IN_LIST sample_list )
  target_compile_options( hipfft_callback PRIVATE -dc )
endif()
hipFFT-rocm-6.4.3/clients/samples/hipfft_1d_d2z.cpp000066400000000000000000000074131501537340400220450ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 1D double-precision real-to-complex transform\n"; const size_t Nx = 8; const size_t Ncomplex = Nx / 2 + 1; std::vector rdata(Nx); size_t real_bytes = sizeof(decltype(rdata)::value_type) * rdata.size(); std::vector> cdata(Ncomplex); size_t complex_bytes = sizeof(std::complex) * cdata.size(); // Create HIP device object double* x; hipError_t hip_rt; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data for(size_t i = 0; i < Nx; i++) { rdata[i] = i; } std::cout << "input:\n"; for(size_t i = 0; i < rdata.size(); i++) { std::cout << rdata[i] << " "; } std::cout << std::endl; hip_rt = hipMemcpy(x, rdata.data(), real_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create the plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan1d(&plan, // plan handle Nx, // transform length HIPFFT_D2Z, // transform type (HIPFFT_R2C for single-precision) 1); // number of transforms (deprecated) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan1d failed"); // Execute plan: // hipfftExecD2Z: 
double precision, hipfftExecR2C: for single-precision // Direction is implied by real-to-complex direction hipfft_rt = hipfftExecD2Z(plan, x, (hipfftDoubleComplex*)x); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecD2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < cdata.size(); i++) { std::cout << cdata[i] << " "; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_1d_z2z.cpp000066400000000000000000000072251501537340400220740ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 1D double-precision complex-to-complex transform\n"; const int Nx = 8; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector> cdata(Nx); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device // Use hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex* x; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data for(size_t i = 0; i < Nx; i++) { cdata[i] = i; } std::cout << "input:\n"; for(size_t i = 0; i < cdata.size(); i++) { std::cout << cdata[i] << " "; } std::cout << std::endl; hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create the plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan1d(&plan, // plan handle Nx, // transform length HIPFFT_Z2Z, // transform type (HIPFFT_C2C for single-precision) 1); // number of transforms if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan1d failed"); // Execute plan: // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < cdata.size(); i++) { std::cout << cdata[i] << " "; } std::cout << std::endl; hipfftDestroy(plan); 
hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_2d_d2z.cpp000066400000000000000000000100261501537340400220400ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 2D double-precision real-to-complex transform\n"; const size_t Nx = 4; const size_t Ny = 5; std::cout << "Nx: " << Nx << "\tNy: " << Ny << std::endl; const size_t Nycomplex = Ny / 2 + 1; const size_t rstride = Nycomplex * 2; // Ny for out-of-place std::cout << "Input:\n"; std::vector rdata(Nx * rstride); for(size_t i = 0; i < Nx * rstride; i++) { rdata[i] = i; } for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { auto pos = i * rstride + j; std::cout << rdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; double* x; hipError_t hip_rt; hip_rt = hipMalloc(&x, rdata.size() * sizeof(decltype(rdata)::value_type)); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy( x, rdata.data(), rdata.size() * sizeof(decltype(rdata)::value_type), hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan: hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan2d(&plan, // plan handle Nx, // transform length Ny, // transform length HIPFFT_D2Z); // transform type (HIPFFT_R2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlandd failed"); // Execute plan: // hipfftExecD2Z: double precision. 
hipfftExecR2C: single-precision hipfft_rt = hipfftExecD2Z(plan, x, (hipfftDoubleComplex*)x); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecD2Z failed"); // Copy the output data to the host: std::vector> cdata(Nx * Nycomplex); hip_rt = hipMemcpy( cdata.data(), x, cdata.size() * sizeof(decltype(cdata)::value_type), hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "Output:\n"; for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Nycomplex; j++) { auto pos = i * Nycomplex + j; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_2d_z2z.cpp000066400000000000000000000076241501537340400221000ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 2D double-precision complex-to-complex transform\n"; const int Nx = 4; const int Ny = 4; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector> cdata(Nx * Ny); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device: // hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex* x; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data for(size_t i = 0; i < Nx * Ny; i++) { cdata[i] = i; } std::cout << "input:\n"; for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { int pos = i * Ny + j; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan2d(&plan, // plan handle Nx, // transform length Ny, // transform length HIPFFT_Z2Z); // transform type (HIPFFT_C2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlandd failed"); // Execute plan // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != 
HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { auto pos = i * Ny + j; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_3d_d2z.cpp000066400000000000000000000105061501537340400220440ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 3D double-precision real-to-complex transform\n"; const size_t Nx = 4; const size_t Ny = 5; const size_t Nz = 6; std::cout << "Nx: " << Nx << "\tNy " << Ny << "\tNz " << Nz << std::endl; const size_t Nzcomplex = Nz / 2 + 1; const size_t rstride = Nzcomplex * 2; // Nz for out-of-place const size_t real_bytes = sizeof(double) * Nx * Ny * rstride; const size_t complex_bytes = 2 * sizeof(double) * Nx * Ny * Nzcomplex; double* x; hipError_t hip_rt; hip_rt = hipMalloc(&x, real_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data std::vector rdata(Nx * Ny * rstride); for(size_t i = 0; i < Nx * Ny * rstride; i++) { rdata[i] = i; } std::cout << "input:\n"; for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { for(size_t k = 0; k < rstride; k++) { auto pos = (i * Ny + j) * rstride + k; std::cout << rdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipMemcpy(x, rdata.data(), real_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan: hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan3d(&plan, // plan handle Nx, Ny, Nz, // transform lengths HIPFFT_D2Z); // transform type (HIPFFT_R2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan3d failed"); // Execute plan: // hipfftExecD2Z: double precision, hipfftExecR2C: single-precision hipfft_rt = hipfftExecD2Z(plan, x, (hipfftDoubleComplex*)x); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecD2Z failed"); std::cout << "output:\n"; 
std::vector> cdata(Nx * Ny * Nz); hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { for(size_t k = 0; k < Nzcomplex; k++) { auto pos = (i * Ny + j) * Nzcomplex + k; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_3d_z2z.cpp000066400000000000000000000102711501537340400220710ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 3D double-precision complex-to-complex transform\n"; const int Nx = 4; const int Ny = 4; const int Nz = 4; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector> cdata(Nx * Ny * Nz); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device: // hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex* x; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); std::cout << "Input:\n"; for(size_t i = 0; i < Nx * Ny * Nz; i++) { cdata[i] = i; } for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { for(int k = 0; k < Nz; k++) { int pos = (i * Ny + j) * Nz + k; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan3d(&plan, // plan handle Nx, // transform length Ny, // transform length Nz, // transform length HIPFFT_Z2Z); // transform type (HIPFFT_C2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan3d failed"); // Execute plan // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) 
throw std::runtime_error("hipMemcpy failed"); for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { for(int k = 0; k < Nz; k++) { int pos = (i * Ny + j) * Nz + k; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_callback.cpp000066400000000000000000000136161501537340400225200ustar00rootroot00000000000000// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include "../hipfft_params.h" struct load_cbdata { hipfftDoubleComplex* filter; double scale; }; __device__ hipfftDoubleComplex load_callback(hipfftDoubleComplex* input, size_t offset, void* cbdata, void* sharedMem) { auto data = static_cast(cbdata); // NB: for optimal performance, one may need a custom // multiplication operator. return hipCmul(hipCmul(input[offset], data->filter[offset]), make_hipDoubleComplex(data->scale, 0)); } __device__ auto load_callback_dev = load_callback; int main() { std::cout << "hipfft 1D double-precision complex-to-complex transform with callback\n"; const int Nx = 8; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector cdata(Nx), filter(Nx); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device // Use hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex *x, *filter_dev; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMalloc(&filter_dev, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Initialize the data and filter for(size_t i = 0; i < Nx; i++) { cdata[i].x = i; cdata[i].y = i; filter[i].x = rand() / static_cast(RAND_MAX); filter[i].y = 0; } hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); hip_rt = hipMemcpy(filter_dev, filter.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "input:\n"; for(size_t i = 0; i < cdata.size(); i++) { std::cout << "(" << cdata[i].x << ", " << cdata[i].y << ") "; } std::cout << std::endl; // Create the plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw 
std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan1d(&plan, // plan handle Nx, // transform length HIPFFT_Z2Z, // transform type (HIPFFT_C2C for single-precision) 1); // number of transforms if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan1d failed"); // prepare callback load_cbdata cbdata_host; cbdata_host.filter = filter_dev; cbdata_host.scale = 1.0 / static_cast(Nx); void* cbdata_dev; hip_rt = hipMalloc(&cbdata_dev, sizeof(load_cbdata)); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); void* cbptr_host = nullptr; hip_rt = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*)); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpyFromSymbol failed"); // set callback hipfft_rt = hipfftXtSetCallback(plan, &cbptr_host, HIPFFT_CB_LD_COMPLEX_DOUBLE, &cbdata_dev); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtSetCallback failed"); // Execute plan: // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < cdata.size(); i++) { std::cout << "(" << cdata[i].x << ", " << cdata[i].y << ") "; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(cbdata_dev); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); hip_rt = hipFree(filter_dev); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } 
hipFFT-rocm-6.4.3/clients/samples/hipfft_multigpu_2d_z2z.cpp000066400000000000000000000127111501537340400240170ustar00rootroot00000000000000// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "Multi-gpu hipFFT 2D double-precision complex-to-complex transform\n"; // 2D FFTs are encountered in diverse applications of image processing, // examples range from image denoising to RTM seismic imaging. // In this example we compare the 2D FFT computation using single vs multiple GPUs. 
// Note that when using cuFFTXt with two or more GPUs, its latest version requires // a minimum size per dimension greater or equal than 32 and less equal than 4096 // for single precision, and 2048 for double precision. const int Nx = 512; const int Ny = 512; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 int verbose = 0; // Initialize reference data std::vector> cinput(Nx * Ny); for(size_t i = 0; i < Nx * Ny; i++) { cinput[i] = i; } if(verbose) { std::cout << "Input:\n"; for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { int pos = i * Ny + j; std::cout << cinput[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; } // Define list of GPUs to use std::array gpus = {0, 1}; // Create the multi-gpu plan hipLibXtDesc* desc; // input descriptor hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; if(hipfftCreate(&plan) != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); // Create a GPU stream and assign it to the plan hipStream_t stream{}; if(hipStreamCreate(&stream) != hipSuccess) throw std::runtime_error("hipStreamCreate failed."); if(hipfftSetStream(plan, stream) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftSetStream failed."); // Assign GPUs to the plan hipfftResult hipfft_rt = hipfftXtSetGPUs(plan, gpus.size(), gpus.data()); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtSetGPUs failed."); // Make the 2D plan size_t workSize[gpus.size()]; hipfft_rt = hipfftMakePlan2d(plan, Nx, Ny, HIPFFT_Z2Z, workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftMakePlan2d failed."); // Copy input data to GPUs hipfftXtSubFormat_t format = HIPFFT_XT_FORMAT_INPLACE_SHUFFLED; hipfft_rt = hipfftXtMalloc(plan, &desc, format); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed."); hipfft_rt = hipfftXtMemcpy(plan, reinterpret_cast(desc), reinterpret_cast(cinput.data()), HIPFFT_COPY_HOST_TO_DEVICE); if(hipfft_rt != HIPFFT_SUCCESS) throw 
std::runtime_error("hipfftXtMemcpy failed."); // Execute the plan hipfft_rt = hipfftXtExecDescriptor(plan, desc, desc, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed."); // Print output if(verbose) { // Move result to the host hipfft_rt = hipfftXtMemcpy(plan, reinterpret_cast(cinput.data()), reinterpret_cast(desc), HIPFFT_COPY_DEVICE_TO_HOST); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy D2H failed."); std::cout << "Output:\n"; for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { auto pos = i * Ny + j; std::cout << cinput[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; } // Clean up if(hipfftXtFree(desc) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtFree failed."); if(hipfftDestroy(plan) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftDestroy failed."); if(hipStreamDestroy(stream) != hipSuccess) throw std::runtime_error("hipStreamDestroy failed."); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_planmany_2d_r2c.cpp000066400000000000000000000125731501537340400237370ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP int main() { std::cout << "hipfft 2D single-precision real-to-complex transform using " "advanced interface\n"; int rank = 2; int n[2] = {4, 5}; int howmany = 3; // batch size int n1_complex_elements = n[1] / 2 + 1; int n1_padding_real_elements = n1_complex_elements * 2; int istride = 1; int ostride = istride; int inembed[2] = {istride * n[0], istride * n1_padding_real_elements}; int onembed[2] = {ostride * n[0], ostride * n1_complex_elements}; int idist = inembed[0] * inembed[1]; int odist = onembed[0] * onembed[1]; std::cout << "n: " << n[0] << " " << n[1] << "\n" << "howmany: " << howmany << "\n" << "istride: " << istride << "\tostride: " << ostride << "\n" << "inembed: " << inembed[0] << " " << inembed[1] << "\n" << "onembed: " << onembed[0] << " " << onembed[1] << "\n" << "idist: " << idist << "\todist: " << odist << "\n" << std::endl; std::vector data(howmany * idist); const auto total_bytes = data.size() * sizeof(decltype(data)::value_type); std::cout << "input:\n"; std::fill(data.begin(), data.end(), 0.0); for(int ibatch = 0; ibatch < howmany; ++ibatch) { for(int i = 0; i < n[0]; i++) { for(int j = 0; j < n[1]; j++) { const auto pos = ibatch * idist + istride * (i * inembed[1] + j); data[pos] = i + ibatch + j; } } } for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < 
inembed[0]; i++) { for(int j = 0; j < inembed[1]; j++) { const auto pos = ibatch * idist + i * inembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftHandle hipForwardPlan; hipfftResult hipfft_rt; hipfft_rt = hipfftPlanMany(&hipForwardPlan, rank, n, inembed, istride, idist, onembed, ostride, odist, HIPFFT_R2C, // Use HIPFFT_D2Z for double-precsion. howmany); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfftReal* gpu_data; hipError_t hip_rt; hip_rt = hipMalloc((void**)&gpu_data, total_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy(gpu_data, (void*)data.data(), total_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); hipfft_rt = hipfftExecR2C(hipForwardPlan, gpu_data, (hipfftComplex*)gpu_data); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to execute plan"); hip_rt = hipMemcpy((void*)data.data(), gpu_data, total_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "output:\n"; const std::complex* output = (std::complex*)data.data(); for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < onembed[0]; i++) { for(int j = 0; j < onembed[1]; j++) { const auto pos = ibatch * odist + i * onembed[1] + j; std::cout << output[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(hipForwardPlan); hip_rt = hipFree(gpu_data); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_planmany_2d_z2z.cpp000066400000000000000000000117361501537340400237760ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP int main() { std::cout << "hipfft 2D double-precision complex-to-complex transform using " "advanced interface\n"; int rank = 2; int n[2] = {4, 5}; int howmany = 3; // array is contiguous in memory int istride = 1; // in-place transforms require istride=ostride int ostride = istride; // we choose to have no padding around our data: int inembed[2] = {istride * n[0], istride * n[1]}; // in-place transforms require inembed=oneembed: int onembed[2] = {inembed[0], inembed[1]}; int idist = inembed[0] * inembed[1]; int odist = onembed[0] * onembed[1]; std::cout << "n: " << n[0] << " " << n[1] << "\n" << "howmany: " << howmany << "\n" << "istride: " << istride << "\tostride: " << ostride << "\n" << "inembed: " << inembed[0] << " " << inembed[1] << "\n" << "onembed: " << onembed[0] << " " << onembed[1] << "\n" << "idist: " << idist << "\todist: " << odist << "\n" << std::endl; std::vector> data(howmany * idist); const auto total_bytes = data.size() * sizeof(decltype(data)::value_type); std::cout << "input:\n"; std::fill(data.begin(), data.end(), 0.0); for(int ibatch = 0; ibatch < howmany; ++ibatch) { for(int i = 0; i < n[0]; i++) { for(int j = 0; j < n[1]; j++) { const auto pos = ibatch * idist + istride * (i * inembed[1] + j); data[pos] = std::complex(i + ibatch, j); } } } for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < inembed[0]; i++) { for(int j = 0; j < inembed[1]; j++) { const auto pos = ibatch * idist + i * inembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftHandle hipPlan; hipfftResult hipfft_rt; hipfft_rt = hipfftPlanMany( &hipPlan, rank, n, inembed, istride, idist, onembed, ostride, odist, HIPFFT_Z2Z, howmany); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to 
create plan"); hipError_t hip_rt; hipfftDoubleComplex* d_in_out; hip_rt = hipMalloc((void**)&d_in_out, total_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy(d_in_out, (void*)data.data(), total_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); hipfft_rt = hipfftExecZ2Z(hipPlan, d_in_out, d_in_out, HIPFFT_FORWARD); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to execute plan"); hip_rt = hipMemcpy((void*)data.data(), d_in_out, total_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "output:\n"; for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < onembed[0]; i++) { for(int j = 0; j < onembed[1]; j++) { const auto pos = ibatch * odist + i * onembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipFree(d_in_out); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/samples/hipfft_setworkarea.cpp000066400000000000000000000113751501537340400233130ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 1D single-precision real-to-complex transform showing " "work memory usage\n"; int major_version; hipfftGetProperty(HIPFFT_MAJOR_VERSION, &major_version); std::cout << "hipFFT major_version " << major_version << std::endl; const size_t N = 9; const size_t Ncomplex = (N / 2 + 1); std::vector rdata(N); std::vector> cdata(Ncomplex); size_t real_bytes = sizeof(decltype(rdata)::value_type) * rdata.size(); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); hipError_t hip_rt = hipSuccess; hipfftResult hipfft_rt = HIPFFT_SUCCESS; std::cout << "input:\n"; for(size_t i = 0; i < N; i++) { rdata[i] = i; } for(size_t i = 0; i < N; i++) { std::cout << rdata[i] << " "; } std::cout << std::endl; // Create HIP device object. 
hipfftReal* x; hip_rt = hipMalloc(&x, real_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hipfftComplex* y; hip_rt = hipMalloc(&y, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Copy input data to device hip_rt = hipMemcpy(x, rdata.data(), real_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); size_t workSize; hipfft_rt = hipfftEstimate1d(N, HIPFFT_R2C, 1, &workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftEstimate1d failed"); std::cout << "hipfftEstimate 1d workSize: " << workSize << std::endl; hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftCreate failed"); hipfft_rt = hipfftSetAutoAllocation(plan, 0); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftSetAutoAllocation failed"); hipfft_rt = hipfftMakePlan1d(plan, N, HIPFFT_R2C, 1, &workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftMakePlan1d failed"); // Set work buffer hipfftComplex* workBuf; hip_rt = hipMalloc(&workBuf, workSize); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hipfft_rt = hipfftSetWorkArea(plan, workBuf); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftSetWorkArea failed"); hipfft_rt = hipfftGetSize(plan, &workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftGetSize failed"); std::cout << "hipfftGetSize workSize: " << workSize << std::endl; // Execute plan hipfft_rt = hipfftExecR2C(plan, x, (hipfftComplex*)y); // Copy result back to host hip_rt = hipMemcpy(cdata.data(), y, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "output:\n"; for(size_t i = 0; i < Ncomplex; i++) { std::cout << cdata[i] << " "; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); 
if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); hip_rt = hipFree(workBuf); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-6.4.3/clients/tests/000077500000000000000000000000001501537340400164075ustar00rootroot00000000000000hipFFT-rocm-6.4.3/clients/tests/CMakeLists.txt000066400000000000000000000170241501537340400211530ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( hipfft-clients-tests LANGUAGES CXX ) find_package( Boost REQUIRED) set( Boost_USE_STATIC_LIBS OFF ) find_package( FFTW 3.0 REQUIRED MODULE COMPONENTS FLOAT DOUBLE ) set( THREADS_PREFER_PTHREAD_FLAG ON ) find_package( Threads REQUIRED ) set( hipfft-test_source gtest_main.cpp hipfft_accuracy_test.cpp simple_test.cpp accuracy_test_1D.cpp accuracy_test_2D.cpp accuracy_test_3D.cpp accuracy_test_callback.cpp multi_device_test.cpp ../../shared/array_validator.cpp ) add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} ) set( TEST_TARGETS hipfft-test ) # MPI worker for MPI tests if( HIPFFT_MPI_ENABLE ) # build MPI worker to support the tests add_executable( hipfft_mpi_worker hipfft_mpi_worker.cpp ) list( APPEND TEST_TARGETS hipfft_mpi_worker ) target_include_directories( hipfft_mpi_worker PRIVATE ${MPI_C_INCLUDE_PATH} ) add_compile_definitions( HIPFFT_MPI_ENABLE ) endif() if( NOT BUILD_WITH_LIB STREQUAL "CUDA" ) if( WIN32 ) find_package( HIP CONFIG REQUIRED ) else() find_package( HIP MODULE REQUIRED ) endif() endif() if( HIPFFT_BUILD_SCOPE ) set( TESTS_OUT_DIR "/../staging" ) elseif( 
HIPFFT_CLIENTS_BUILD_SCOPE ) set( TESTS_OUT_DIR "/../bin" ) else() set( TESTS_OUT_DIR "/bin" ) endif() string( CONCAT TESTS_OUT_DIR "${PROJECT_BINARY_DIR}" ${TESTS_OUT_DIR} ) option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON ) foreach( target ${TEST_TARGETS} ) set_target_properties( ${target} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( BUILD_WITH_LIB STREQUAL "ROCM" ) target_compile_options( ${target} PRIVATE ${WARNING_FLAGS} ) target_link_libraries( ${target} PRIVATE hip::host hip::device ) foreach( gpu_target ${AMDGPU_TARGETS} ) target_compile_options( ${target} PRIVATE --offload-arch=${gpu_target} ) endforeach() if( NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() if( USE_HIPRAND ) target_link_libraries( ${target} PRIVATE hip::hiprand ) endif() else() target_compile_definitions( ${target} PRIVATE __HIP_PLATFORM_NVIDIA__) target_include_directories( ${target} PRIVATE ${HIP_INCLUDE_DIRS}) if( CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+$" ) target_compile_options( ${target} PRIVATE -cuda -Xptxas=-w) target_link_options( ${target} PRIVATE -cuda) else() target_compile_options( ${target} PRIVATE -arch sm_53 -gencode=arch=compute_53,code=sm_53 -Xptxas=-w) endif() if( NVHPC_FOUND ) target_link_libraries( ${target} PRIVATE NVHPC::CUDART ) else() target_link_libraries( ${target} PRIVATE CUDA::cudart ) endif() target_compile_definitions( ${target} PUBLIC _CUFFT_BACKEND ) endif() if( BUILD_CLIENTS_TESTS_OPENMP ) if( BUILD_WITH_LIB STREQUAL "CUDA" ) message( STATUS "OpenMP is not supported on CUDA, building tests without it" ) else() target_compile_options( ${target} PRIVATE -DBUILD_CLIENTS_TESTS_OPENMP ) if(NOT (CMAKE_CXX_COMPILER MATCHES ".*hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")) target_compile_options( ${target} PRIVATE -fopenmp ) target_link_libraries( ${target} PRIVATE -fopenmp -L${HIP_CLANG_ROOT}/lib -Wl,-rpath=${HIP_CLANG_ROOT}/lib ) target_include_directories( ${target} PRIVATE ${HIP_CLANG_ROOT}/include ) else() 
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") target_compile_options( ${target} PRIVATE -fopenmp=libomp ) target_link_options( ${target} PRIVATE -fopenmp=libomp ) endif() endif() endif() endif() target_include_directories( ${target} PRIVATE $ $ $ $ ) target_link_libraries( ${target} PRIVATE hip::hipfft ${FFTW_LIBRARIES} ) if( HIPFFT_MPI_ENABLE ) target_link_libraries( ${target} PRIVATE MPI::MPI_CXX ) endif() set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TESTS_OUT_DIR}) rocm_install(TARGETS ${target} COMPONENT tests) endforeach() if( GTEST_FOUND ) target_include_directories( hipfft-test PRIVATE $ ) target_link_libraries( hipfft-test PRIVATE ${GTEST_LIBRARIES} ) else() add_dependencies( hipfft-test gtest ) target_include_directories( hipfft-test PRIVATE hipfft-test_include_dirs ${GTEST_INCLUDE_DIRS} ) target_link_libraries( hipfft-test PRIVATE ${GTEST_LIBRARIES} ) endif() if(FFTW_MULTITHREAD) target_compile_options( hipfft-test PRIVATE -DFFTW_MULTITHREAD ) endif( ) target_link_libraries( hipfft-test PRIVATE Threads::Threads ) if (WIN32) # Ensure tests run with HIP DLLs and not anything the driver owns # in system32. Libraries like amdhip64.dll are also in the HIP # runtime, and we need run with those. But the only way to make a # same-named DLL override something in system32 is to have it next # to the executable. So copy them in. file( GLOB third_party_dlls LIST_DIRECTORIES OFF CONFIGURE_DEPENDS ${HIP_DIR}/bin/*.dll C:/Windows/System32/libomp140*.dll ) foreach( file_i ${third_party_dlls}) add_custom_command( TARGET hipfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $ ) endforeach( file_i ) endif() hipFFT-rocm-6.4.3/clients/tests/accuracy_test_1D.cpp000066400000000000000000000731611501537340400223000ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "../../shared/fft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" using ::testing::ValuesIn; // TODO: handle special case where length=2 for real/complex transforms. 
const static std::vector pow2_range = {2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912}; // 2^30 is 1073741824; const static std::vector pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; const static std::vector pow3_range = {3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, 1594323, 4782969, 14348907, 43046721, 129140163, 387420489}; const static std::vector pow5_range = {5, 25, 125, 625, 3125, 15625, 78125, 390625, 1953125, 9765625, 48828125, 244140625}; // radix 7, 11, 13 sizes that are either pure powers or sizes people have wanted in the wild const static std::vector radX_range = {7, 49, 84, 112, 11, 13, 52, 104, 208, 343, 2401, 16807}; const static std::vector mix_range = {6, 10, 12, 15, 20, 30, 56, 120, 150, 225, 240, 300, 336, 486, 600, 900, 1250, 1500, 1875, 2160, 2187, 2250, 2500, 3000, 4000, 12000, 24000, 72000}; const static std::vector prime_range = {17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}; static std::vector small_1D_sizes() { static const size_t SMALL_1D_MAX = 8192; // generate a list of sizes from 2 and up, skipping any sizes that are already covered std::vector covered_sizes; std::copy(pow2_range.begin(), pow2_range.end(), std::back_inserter(covered_sizes)); std::copy(pow3_range.begin(), pow3_range.end(), std::back_inserter(covered_sizes)); std::copy(pow5_range.begin(), pow5_range.end(), std::back_inserter(covered_sizes)); std::copy(radX_range.begin(), radX_range.end(), std::back_inserter(covered_sizes)); std::copy(mix_range.begin(), mix_range.end(), std::back_inserter(covered_sizes)); std::copy(prime_range.begin(), prime_range.end(), std::back_inserter(covered_sizes)); std::sort(covered_sizes.begin(), covered_sizes.end()); std::vector output; for(size_t i = 2; i < SMALL_1D_MAX; ++i) { 
if(!std::binary_search(covered_sizes.begin(), covered_sizes.end(), i)) { output.push_back(i); } } return output; } const static std::vector> stride_range = {{1}}; const static std::vector batch_range_1D = {4, 2, 1}; const static std::vector> stride_range_for_prime = {{1}, {2}, {3}, {64}, {65}}; //TODO: this will be merged back to stride_range const static std::vector> ioffset_range_zero = {{0, 0}}; const static std::vector> ooffset_range_zero = {{0, 0}}; const static std::vector> ioffset_range = {{0, 0}, {1, 1}}; const static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, 
ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range}), 
precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); // small 1D sizes just need to make sure our factorization isn't // completely broken, so we just check simple C2C outplace interleaved INSTANTIATE_TEST_SUITE_P(small_1D, accuracy_test, ::testing::ValuesIn(param_generator_base( test_prob, {fft_transform_type_complex_forward}, generate_lengths({small_1D_sizes()}), {fft_precision_single}, {1}, [](fft_transform_type t, const std::vector& place_range, const bool planar) { return std::vector{ std::make_tuple(t, place_range[0], fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)}; }, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace})), accuracy_test::TestName); // NB: // We have known non-unit strides issues for 1D: // - C2C middle size(for instance, single precision, 8192) // - C2C large size(for instance, single precision, 524288) // We need to fix non-unit strides first, and then address non-unit strides + batch tests. // Then check these problems of R2C and C2R. After that, we could open arbitrary permutations in the // main tests. // // The below test covers non-unit strides, pow of 2, middle sizes, which has SBCC/SBRC kernels // invloved. 
const static std::vector pow2_range_for_stride = {4096, 8192, 524288}; const static std::vector pow2_range_for_stride_half = {4096, 8192}; const static std::vector> stride_range_for_pow2 = {{2}, {3}}; const static std::vector batch_range_for_stride = {2, 1}; INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex, accuracy_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real_half, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_half}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); // Create an array parameters for strided 2D batched transforms. 
inline auto param_generator_complex_1d_batched_2d(const double base_prob, const std::vector>& v_lengths, const std::vector& precision_range, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range) { std::vector params; // for(auto& transform_type : // {fft_transform_type_complex_forward, fft_transform_type_complex_inverse}) // { for(auto& transform_type : trans_type_range_complex) { for(const auto& lengths : v_lengths) { // try to ensure that we are given literal lengths, not // something to be passed to generate_lengths if(lengths.empty() || lengths.size() > 3) { assert(false); continue; } for(const auto precision : precision_range) { for(const auto& types : generate_types(transform_type, place_range, false)) { for(const auto& ioffset : ioffset_range) { for(const auto& ooffset : ooffset_range) { fft_params param; param.length = lengths; param.istride = lengths; param.ostride = lengths; param.nbatch = lengths[0]; param.precision = precision; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.idist = 1; param.odist = 1; param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.ioffset = ioffset; param.ooffset = ooffset; param.validate(); const double roll = hash_prob(random_seed, param.token()); const double run_prob = base_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? 
real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } if(param.valid(0)) { params.push_back(param); } } } } } } } return params; } const static std::vector pow2_range_2D = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}; INSTANTIATE_TEST_SUITE_P( pow2_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow2_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); const static std::vector pow3_range_2D = {3, 27, 81, 243, 729, 2187, 6561}; INSTANTIATE_TEST_SUITE_P( pow3_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow3_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); const static std::vector pow5_range_2D = {5, 25, 125, 625, 3125, 15625}; INSTANTIATE_TEST_SUITE_P( pow5_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow5_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); const static std::vector prime_range_2D = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277}; INSTANTIATE_TEST_SUITE_P( prime_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({prime_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); hipFFT-rocm-6.4.3/clients/tests/accuracy_test_2D.cpp000066400000000000000000000410411501537340400222710ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "../../shared/fft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" using ::testing::ValuesIn; // Set parameters // TODO: enable 16384, 32768 when omp support is available (takes too // long!) 
const static std::vector pow2_range = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}; // For the current configuration, half-precision has a fft size limit of 65536 const static std::vector pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}; const static std::vector pow3_range = {3, 27, 81, 243, 729, 2187, 6561}; const static std::vector pow5_range = {5, 25, 125, 625, 3125, 15625}; const static std::vector prime_range = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277}; const static std::vector mix_range = {56, 120, 336, 2160, 5000, 6000, 8000}; const static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_2D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half, {2, 4, 8, 16, 32}}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half, {2, 4, 8, 16, 32}}), {fft_precision_half}, batch_range, stride_range, stride_range, 
ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, 
generate_lengths({mix_range, mix_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range, mix_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); // test length-1 on one dimension against a variety of non-1 lengths INSTANTIATE_TEST_SUITE_P(len1_2D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); // length-1 on the other dimension INSTANTIATE_TEST_SUITE_P(len1_swap_2D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); hipFFT-rocm-6.4.3/clients/tests/accuracy_test_3D.cpp000066400000000000000000000313241501537340400222750ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "../../shared/fft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" using ::testing::ValuesIn; // Set parameters // TODO: 512, 1024, 2048 make the tests take too long; re-enable when // test speed is improved. 
static std::vector pow2_range = {4, 8, 16, 32, 128, 256}; static std::vector pow2_range_half = {4, 8, 16, 32}; // SBCC+SBRC as a sub-node of a 3D TRTRTR std::vector> pow2_adhoc = {{4, 4, 8192}}; static std::vector pow3_range = {3, 9, 27, 81, 243}; static std::vector pow5_range = {5, 25, 125}; static std::vector prime_range = {7, 11, 13, 17, 19, 23, 29}; static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P( pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range, pow2_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range, pow2_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_3D_half, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_half, pow2_range_half, pow2_range_half}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_3D_half, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_half, pow2_range_half, pow2_range_half}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow3_3D, accuracy_test, 
::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range, pow3_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range, pow3_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range, pow5_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range, pow5_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range, prime_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow3_range, 
prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow3_range, prime_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); // Test combinations of SBRC sizes, plus a non-SBRC size (10) to // exercise fused SBRC+transpose kernels. static std::vector sbrc_range = {50, 64, 81, 100, 200, 10, 128, 256}; static std::vector sbrc_batch_range = {2, 1}; INSTANTIATE_TEST_SUITE_P( sbrc_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({sbrc_range, sbrc_range, sbrc_range}), precision_range_sp_dp, sbrc_batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); // pick small sizes that will exercise 2D_SINGLE and a couple of sizes that won't static std::vector inner_batch_3D_range = {4, 8, 16, 32, 20, 24, 64}; static std::vector inner_batch_3D_batch_range = {3, 2, 1}; INSTANTIATE_TEST_SUITE_P( inner_batch_3D, accuracy_test, // TODO: enable for real as well, but currently real kernels have // trouble with weird strides ::testing::ValuesIn(param_generator_complex( test_prob, generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}), precision_range_sp_dp, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); hipFFT-rocm-6.4.3/clients/tests/accuracy_test_callback.cpp000066400000000000000000000141771501537340400235720ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h"
#include "../../shared/params_gen.h"

// Problem sizes shared by the callback suites and the result-scaling suite.
// NOTE(review): the template arguments of the std::vector declarations in this file are missing
// in this copy (`std::vector>`); restore them from upstream before building.
std::vector> callback_sizes = {
    // some single kernel sizes
    {4},
    {16},
    {81},
    {100},

    // L1D_TRTRT sizes
    {70},
    {77},
    {1344},

    // L1D_CC sizes
    {8192},
    {10000},

    // prime
    {23},
    {29},

    // 2D_SINGLE sizes, small and big
    {16, 8},
    {32, 32},
    {9, 81},
    {27, 81},
    {81, 27},
    {256, 9},
    {9, 256},
    {125, 32},
    {32, 125},

    // 2D_RTRT
    {20, 40},
    {81, 81},

    // 2D_RC
    {128, 64},
    {128, 256},

    // more complicated children of 2D_RTRT (L1D_TRTRT, L1D_CC, prime)
    {4, 63},
    {63, 4},
    {4, 8192},
    {8192, 4},
    {4, 23},
    {23, 4},

    // 3D_TRTRTR, with complicated children
    {63, 5, 6},
    {6, 5, 63},
    {23, 5, 6},
    {6, 5, 23},
    {70, 5, 6},
    {6, 5, 70},
    {8192, 5, 6},
    {6, 5, 8192},

    // 3D_RTRT, with complicated children
    {23, 4, 4},
    {4, 4, 23},
    {70, 4, 4},
    {4, 4, 70},
    {8192, 4, 4},
    {4, 4, 8192},

    // 3D odd lengths
    {27, 27, 27},

    // 3D_BLOCK_RC
    {64, 64, 64},
};

const static std::vector> stride_range = {{1}};

const static std::vector> ioffset_range_zero = {{0, 0}};
const static std::vector> ooffset_range_zero = {{0, 0}};

const static std::vector> ioffset_range = {{0, 0}, {1, 1}};
const static std::vector> ooffset_range = {{0, 0}, {1, 1}};

// Transform types passed to param_generator_base for the callback suites.
auto transform_types = {fft_transform_type_complex_forward, fft_transform_type_real_forward};

// The callback suites are only instantiated when compiling with __HIP__ defined.
#ifdef __HIP__
INSTANTIATE_TEST_SUITE_P(callback,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_base(test_prob,
                                                                  transform_types,
                                                                  callback_sizes,
                                                                  precision_range_sp_dp,
                                                                  batch_range,
                                                                  generate_types,
                                                                  stride_range,
                                                                  stride_range,
                                                                  ioffset_range_zero,
                                                                  ooffset_range_zero,
                                                                  place_range,
                                                                  false,
                                                                  true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(DISABLED_callback,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_base(test_prob,
                                                                  transform_types,
                                                                  callback_sizes,
                                                                  precision_range_sp_dp,
                                                                  batch_range,
                                                                  generate_types,
                                                                  stride_range,
                                                                  stride_range,
                                                                  ioffset_range,
                                                                  ooffset_range,
                                                                  place_range,
                                                                  false,
                                                                  true)),
                         accuracy_test::TestName);
#endif

// one of the obvious use cases for callbacks is to implement result
// scaling manually, so use the same sizes to test rocFFT's own
// result scaling feature.
//
// Builds the parameter list for the given problem lengths and sets scale_factor = 7.23 on
// every generated entry.
//
// v_lengths: problem lengths to generate parameters for.
// Returns:   whatever param_generator returns, with scale_factor overwritten on each element.
inline auto param_generator_scaling(const std::vector>& v_lengths)
{
    // Fixed: the lengths argument used to be ignored in favour of the global callback_sizes.
    // The only caller in this file passes callback_sizes, so its behaviour is unchanged.
    auto params = param_generator(test_prob,
                                  v_lengths,
                                  precision_range_sp_dp,
                                  batch_range,
                                  stride_range,
                                  stride_range,
                                  ioffset_range_zero,
                                  ooffset_range_zero,
                                  place_range,
                                  false);

    for(auto& param : params)
        param.scale_factor = 7.23;
    return params;
}

// cuFFT does not support result scaling
#ifndef _CUFFT_BACKEND
INSTANTIATE_TEST_SUITE_P(scaling,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_scaling(callback_sizes)),
                         accuracy_test::TestName);
#endif
// NOTE(review): the next line starts with an archive member header (ustar), not C++ source.
hipFFT-rocm-6.4.3/clients/tests/gtest_main.cpp000066400000000000000000000553561501537340400212630ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/// @file /// @brief googletest based unit tester for hipfft /// #include #include #include #include #include #include #include #include #include "../../shared/CLI11.hpp" #include "../../shared/concurrency.h" #include "../../shared/device_properties.h" #include "../../shared/environment.h" #include "../../shared/hostbuf.h" #include "../../shared/sys_mem.h" #include "../../shared/work_queue.h" #include "../hipfft_params.h" #include "hipfft/hipfft.h" #include "hipfft_accuracy_test.h" #include "hipfft_test_params.h" // Control output verbosity: int verbose; // Run a short (~5 min) test suite by setting test_prob to an appropriate value bool smoketest = false; // User-defined random seed size_t random_seed; // Overall probability of running conventional tests double test_prob; // Probability of running tests from the emulation suite double emulation_prob; // Modifier for probability of running tests with complex interleaved data double complex_interleaved_prob_factor; // Modifier for probability of running tests with real data double real_prob_factor; // Modifier for probability of running tests with complex planar data double complex_planar_prob_factor; // Modifier for probability of running tests with callbacks double callback_prob_factor; // Transform parameters for manual test: fft_params manual_params; // Host memory limitation for tests (GiB): size_t ramgb; // Device memory limitation for tests (GiB): size_t vramgb; // Allow skipping tests if there is a runtime error bool skip_runtime_fails; // But count the number of failures int n_hip_failures = 0; // Manually specified precision cutoffs: double single_epsilon; double double_epsilon; double half_epsilon; // Measured precision cutoffs: double max_linf_eps_double = 0.0; double max_l2_eps_double = 0.0; double max_linf_eps_single = 0.0; double max_l2_eps_single = 0.0; double max_linf_eps_half = 0.0; double max_l2_eps_half = 0.0; // Control whether we use FFTW's wisdom (which we use to imply FFTW_MEASURE). 
bool use_fftw_wisdom = false;

// Compare results against FFTW in accuracy tests
bool fftw_compare = true;

// Cache the last cpu fft that was requested
last_cpu_fft_cache last_cpu_fft_data;

// Multi-process library to use
fft_params::fft_mp_lib mp_lib = fft_params::fft_mp_lib_none;
// Number of multi-process ranks to launch
int mp_ranks = 1;
// Multi-process launch command (e.g. mpirun --np 4 /path/to/rocfft_mpi_worker)
std::string mp_launch;

// Runs RUN_ALL_TESTS() once with gtest's list_tests flag forced to true and stdout
// redirected to the null device, then restores both the flag and stdout.
void init_gtest_flags()
{
    // HACK: gtest maintains a "should run" flag on each test case,
    // but only sets it during RUN_ALL_TESTS.  Precompiling should
    // ideally only happen for the test cases that would actually
    // run.
    //
    // So call RUN_ALL_TESTS once with the "list tests" temporarily set
    // to true, to initialize all of that.
    //
    // gtest will then print all of the test cases to stdout.
    // Temporarily redirect stdout to /dev/null as well.

    // after this swap temp_list_tests holds the caller's original flag value
    bool temp_list_tests = true;
    std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests));

    // move stdout to devnull
    // (stdout_copy keeps a duplicate of the original descriptor so it can be restored below)
#ifdef WIN32
    int stdout_fd   = _fileno(stdout);
    int devnull     = _open("NUL", _O_WRONLY);
    int stdout_copy = _dup(stdout_fd);
    _dup2(devnull, stdout_fd);
#else
    int stdout_fd   = STDOUT_FILENO;
    int devnull     = open("/dev/null", O_WRONLY);
    int stdout_copy = dup(stdout_fd);
    dup2(devnull, stdout_fd);
#endif

    // result deliberately discarded: this pass only exists to populate gtest's flags
    (void)RUN_ALL_TESTS();

    // put stdout back
#ifdef WIN32
    _dup2(stdout_copy, stdout_fd);
    _close(stdout_copy);
    _close(devnull);
#else
    dup2(stdout_copy, stdout_fd);
    close(stdout_copy);
    close(devnull);
#endif
    // second swap restores the original list_tests value
    std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests));
}

// Creates an FFT plan for every accuracy test case that gtest reports as "should run",
// using rocfft_concurrency() worker threads, and prints the elapsed time.
//
// precompile_file: not read inside this function. main() uses the same string to set
//                  ROCFFT_RTC_CACHE_PATH before calling it (when that variable is unset).
//
// NOTE(review): template arguments are missing from several declarations in this copy
// (std::vector, std::chrono::duration, static_cast) -- restore from upstream.
void precompile_test_kernels(const std::string& precompile_file)
{
    std::cout << "precompiling test kernels...\n";
    WorkQueue tokenQueue;

    init_gtest_flags();

    // collect one token per runnable accuracy test
    std::vector tokens;
    auto        ut = testing::UnitTest::GetInstance();

    for(int ts_index = 0; ts_index < ut->total_test_suite_count(); ++ts_index)
    {
        const auto ts = ut->GetTestSuite(ts_index);
        for(int ti_index = 0; ti_index < ts->total_test_count(); ++ti_index)
        {
            const auto  ti   = ts->GetTestInfo(ti_index);
            std::string name = ti->name();
            // only precompile test cases that will run
            if(!ti->should_run())
                continue;
            // only care about accuracy tests
            if(name.find("vs_fftw/") != std::string::npos)
            {
                // drops the first 8 characters, i.e. the length of "vs_fftw/"
                name.erase(0, 8);

                // change batch to 1, so we don't waste time creating
                // multiple plans that differ only by batch
                auto idx = name.find("_batch_");
                if(idx == std::string::npos)
                    continue;
                // advance idx to batch number (7 == length of "_batch_")
                idx += 7;
                auto end = name.find('_', idx);
                if(end == std::string::npos)
                    continue;
                name.replace(idx, end - idx, "1");

                tokens.emplace_back(std::move(name));
            }
        }
    }

    // shuffle the tokens before handing them to the workers
    std::random_device dev;
    std::mt19937       dist(dev());
    std::shuffle(tokens.begin(), tokens.end(), dist);
    auto precompile_begin = std::chrono::steady_clock::now();
    std::cout << "precompiling " << tokens.size() << " FFT plans...\n";

    for(auto&& t : tokens)
        tokenQueue.push(std::move(t));

    // environment override held for the rest of this function's scope
    EnvironmentSetTemp env_compile_only{"ROCFFT_INTERNAL_COMPILE_ONLY", "1"};

    const size_t NUM_THREADS = rocfft_concurrency();
    std::vector  threads;
    for(size_t i = 0; i < NUM_THREADS; ++i)
    {
        threads.emplace_back([&tokenQueue]() {
            for(;;)
            {
                std::string token{tokenQueue.pop()};
                // an empty token is the stop signal
                if(token.empty())
                    break;

                try
                {
                    hipfft_params params;
                    params.from_token(token);
                    params.validate();
                    params.create_plan();
                }
                catch(fft_params::work_buffer_alloc_failure&)
                {
                    // work-buffer allocation failures are ignored here; move on to the next token
                    continue;
                }
                catch(std::exception& e)
                {
                    // failed to create a plan, abort
                    //
                    // we could continue on, but the test should just
                    // fail later anyway in the same way.  so report
                    // which token failed early and get out
                    //
                    // NOTE(review): this throw leaves the thread's entry function; nothing in
                    // this function catches it.
                    throw std::runtime_error(token + " plan creation failure: " + e.what());
                }
            }
        });
        // insert empty tokens to tell threads to stop
        // (one per worker, queued behind the real tokens pushed above)
        tokenQueue.push({});
    }
    for(auto& t : threads)
        t.join();

    auto                          precompile_end = std::chrono::steady_clock::now();
    std::chrono::duration precompile_ms  = precompile_end - precompile_begin;
    std::cout << "done precompiling FFT plans in " << static_cast(precompile_ms.count())
              << " ms\n";
}

// Program entry point: registers CLI11 options, parses the command line, and runs the
// gtest suites. This first part builds the CLI app and registers the options that
// configure test selection.
int main(int argc, char* argv[])
{
    CLI::App app{
        "\n"
        "hipFFT Runtime Test command line options\n"
        "NB: input parameters are row-major.\n"
        "\n"
        "FFTW accuracy test cases are named using these identifiers:\n"
        "\n"
        " len_: problem dimensions, row-major\n"
        " single,double: precision\n"
        " ip,op: in-place or out-of-place\n"
        " batch_: batch size\n"
        " istride__: input stride (ostride for output stride), format may be:\n"
        " CI - complex interleaved\n"
        " CP - complex planar\n"
        " R - real\n"
        " HI - hermitian interleaved\n"
        " HP - hermitian planar\n"
        "\n"
        "Usage"};

    // Override CLI11 help to print after later CLI11 options that are defined, and allow gtest's help
    app.set_help_flag("");
    CLI::Option* opt_help = app.add_flag("-h, --help", "Produces this help message");

    app.add_option("-v, --verbose", verbose, "Print out detailed information for the tests")
        ->default_val(0);
    app.add_option("--test_prob", test_prob, "Probability of running individual tests")
        ->default_val(1.0)
        ->check(CLI::Range(0.0, 1.0));
    app.add_option(
           "--complex_interleaved_prob_factor",
           complex_interleaved_prob_factor,
           "Probability multiplier for running individual transforms with complex interleaved data")
        ->default_val(1)
        ->check(CLI::NonNegativeNumber);
    app.add_option("--callback_prob",
                   callback_prob_factor,
                   "Probability multiplier for running individual callback transforms")
        ->default_val(0.1)
        ->check(CLI::NonNegativeNumber);
    app.add_option("--fftw_compare", fftw_compare, "Compare to FFTW in accuracy tests")
        ->default_val(true);
// main() continued: multi-process options, first parse, remaining options, second parse, setup.
    app.add_option("--mp_lib", mp_lib, "Multi-process library type: none (default), mpi")
        ->default_val("none");
    app.add_option("--mp_ranks", mp_ranks, "Number of multi-process ranks to launch")
        ->default_val(1)
        ->check(CLI::NonNegativeNumber);
    app.add_option("--mp_launch",
                   mp_launch,
                   "Command line prefix to launch multi-process transforms, e.g. \"mpirun --np 4 "
                   "/path/to/rocfft_mpi_worker\"")
        ->default_val("")
        ->each([&](const std::string&) {
            // exits the process when --mp_launch is seen while mp_lib is still "none"
            if(mp_lib == fft_params::fft_mp_lib_none)
            {
                std::cout << "--mp_launch requires an mp library (see mp_lib in --help).\n";
                std::exit(-1);
            }
        })
        ->needs("--mp_lib");

    // FIXME: Seed has no use currently
    // CLI::Option* opt_seed = app.add_option("--seed", random_seed, "Random seed; if unset, use an actual random seed");

    app.add_flag("--smoketest", "Run a short (approx 5 minute) randomized selection of tests")
        ->each([&](const std::string&) {
            // The objective is to have a test that takes about 5 minutes, so just set the probability
            // per test to a small value to achieve this result.
            test_prob = 0.002;
        });

    // Try parsing initial args that will be used to configure tests
    // Allow extras to pass on gtest and hipFFT arguments without error
    app.allow_extras();
    try
    {
        app.parse(argc, argv);
    }
    catch(const CLI::ParseError& e)
    {
        return app.exit(e);
    }

    // NB: If we initialize gtest first, then it removes all of its own command-line
    // arguments and sets argc and argv correctly;
    ::testing::InitGoogleTest(&argc, argv);

    // Filename for fftw and fftwf wisdom.
    std::string fftw_wisdom_filename;

    // Token string to fully specify fft params for the manual test.
    std::string test_token;

    // Filename for precompiled kernels to be written to
    std::string precompile_file;

    // Declare the supported options. Some option pointers are declared to track passed opts.
    app.add_flag("--callback", "Inject load/store callbacks")->each([&](const std::string&) {
        manual_params.run_callbacks = true;
    });
    // app.add_flag("--version", "Print queryable version information from the rocfft library")
    //     ->each([](const std::string&) {
    //         rocfft_setup();
    //         char v[256];
    //         rocfft_get_version_string(v, 256);
    //         std::cout << "rocFFT version: " << v << std::endl;
    //         return EXIT_SUCCESS;
    //     });
    CLI::Option* opt_token
        = app.add_option("--token", test_token, "Test token name for manual test")->default_val("");

    // Group together options that conflict with --token
    auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token");
    non_token
        ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
        ->each([&](const std::string&) { manual_params.precision = fft_precision_double; });
    non_token->excludes(opt_token);
    non_token
        ->add_option("-t, --transformType",
                     manual_params.transform_type,
                     "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
                     "forward\n3) real inverse")
        ->default_val(fft_transform_type_complex_forward);
    non_token
        ->add_option("--precision",
                     manual_params.precision,
                     "Transform precision: single (default), double, half")
        ->excludes("--double");
    non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)")
        ->each([&](const std::string&) { manual_params.placement = fft_placement_notinplace; });
    non_token
        ->add_option("--itype",
                     manual_params.itype,
                     "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    non_token
        ->add_option("--otype",
                     manual_params.otype,
                     "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    non_token->add_option("--length", manual_params.length, "Lengths")->expected(1, 3);
    non_token
        ->add_option("-b, --batchSize",
                     manual_params.nbatch,
                     "If this value is greater than one, arrays will be used")
        ->default_val(1);
    non_token->add_option("--istride", manual_params.istride, "Input stride");
    non_token->add_option("--ostride", manual_params.ostride, "Output stride");
    non_token->add_option("--idist", manual_params.idist, "Logical distance between input batches")
        ->default_val(0);
    non_token->add_option("--odist", manual_params.odist, "Logical distance between output batches")
        ->default_val(0);
    non_token->add_option("--ioffset", manual_params.ioffset, "Input offset");
    non_token->add_option("--ooffset", manual_params.ooffset, "Output offset");

    // the options below are registered on the app itself, not on the --token-exclusive group
    app.add_option("--isize", manual_params.isize, "Logical size of input buffer");
    app.add_option("--osize", manual_params.osize, "Logical size of output buffer");
    app.add_option("--R", ramgb, "RAM limit in GiB for tests")
        ->default_val(host_memory::singleton().get_total_gbytes());
    app.add_option("--V", vramgb, "VRAM limit in GiB for tests")->default_val(0);
    app.add_option("--half_epsilon", half_epsilon)->default_val(9.77e-4);
    app.add_option("--single_epsilon", single_epsilon)->default_val(3.75e-5);
    app.add_option("--double_epsilon", double_epsilon)->default_val(1e-15);
    app.add_option("--skip_runtime_fails",
                   skip_runtime_fails,
                   "Skip the test if there is a runtime failure")
        ->default_val(true);
    app.add_option("-w, --wise", use_fftw_wisdom, "Use FFTW wisdom");
    app.add_option("-W, --wisdomfile", fftw_wisdom_filename, "FFTW3 wisdom filename")
        ->default_val("wisdom3.txt");
    app.add_option("--scalefactor", manual_params.scale_factor, "Scale factor to apply to output");
    app.add_option("--precompile",
                   precompile_file,
                   "Precompile kernels to a file for all test cases before running tests")
        ->default_val("");
    // Default value is set in fft_params.h based on if device-side PRNG was enabled.
    app.add_option("-g, --inputGen",
                   manual_params.igen,
                   "Input data generation:\n0) PRNG sequence (device)\n"
                   "1) PRNG sequence (host)\n"
                   "2) linearly-spaced sequence (device)\n"
                   "3) linearly-spaced sequence (host)");

    // Parse rest of args and catch any errors here
    // (second parse: argc/argv have already been passed through InitGoogleTest above)
    try
    {
        app.parse(argc, argv);
    }
    catch(const CLI::ParseError& e)
    {
        return app.exit(e);
    }

    if(*opt_help)
    {
        std::cout << app.help() << "\n";
        return EXIT_SUCCESS;
    }

    // Ensure there are no leftover options used by neither gtest nor CLI11
    // NOTE(review): the element type of this vector is missing in this copy of the file.
    std::vector remaining_args = app.remaining();
    if(!remaining_args.empty())
    {
        std::cout << "Unrecognised option(s) found:\n ";
        // NOTE(review): queries app.remaining() again instead of reusing remaining_args
        for(auto i : app.remaining())
            std::cout << i << " ";
        std::cout << "\nRun with --help for more information.\n";
        return EXIT_FAILURE;
    }

    std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon
              << "\tdouble epsilon: " << double_epsilon << std::endl;

    // fall back to a length-8 transform with unit strides when nothing was specified
    if(manual_params.length.empty())
    {
        manual_params.length.push_back(8);
        // TODO: add random size?
    }

    if(manual_params.istride.empty())
    {
        manual_params.istride.push_back(1);
        // TODO: add random size?
    }

    if(manual_params.ostride.empty())
    {
        manual_params.ostride.push_back(1);
        // TODO: add random size?
    }

    // if precompiling, tell rocFFT to use the specified cache file
    // to write kernels to
    //
    // but if our environment already has a cache file for RTC, then
    // we should just use that
    // NOTE(review): the template arguments of unique_ptr / make_unique are missing in this copy.
    std::unique_ptr env_precompile;
    if(!precompile_file.empty() && rocfft_getenv("ROCFFT_RTC_CACHE_PATH").empty())
    {
        env_precompile = std::make_unique("ROCFFT_RTC_CACHE_PATH", precompile_file.c_str());
    }

    // rocfft_setup();
    // char v[256];
    // rocfft_get_version_string(v, 256);
    // std::cout << "rocFFT version: " << v << std::endl;

#ifdef FFTW_MULTITHREAD
    fftw_init_threads();
    fftwf_init_threads();
    fftw_plan_with_nthreads(rocfft_concurrency());
    fftwf_plan_with_nthreads(rocfft_concurrency());
#endif

    // Set host memory limit from command-line options
    host_memory::singleton().set_limit_gbytes(ramgb);

    if(use_fftw_wisdom)
    {
        if(verbose)
        {
            std::cout << "Using " << fftw_wisdom_filename << " wisdom file\n";
        }
        // read the whole wisdom file into memory
        std::ifstream fftw_wisdom_file(fftw_wisdom_filename);
        std::string   allwisdom = std::string(std::istreambuf_iterator(fftw_wisdom_file),
                                            std::istreambuf_iterator());

        std::string fftw_wisdom;
        std::string fftwf_wisdom;

        bool               load_wisdom  = false;
        bool               load_fwisdom = false;
        std::istringstream input;
        input.str(allwisdom);
        // Separate the single-precision and double-precision wisdom:
        // a line starting with "(fftw" opens a section, a line starting with ")" closes it
        for(std::string line; std::getline(input, line);)
        {
            if(line.rfind("(fftw", 0) == 0 && line.find("fftw_wisdom") != std::string::npos)
            {
                load_wisdom = true;
            }
            if(line.rfind("(fftw", 0) == 0 && line.find("fftwf_wisdom") != std::string::npos)
            {
                load_fwisdom = true;
            }
            if(load_wisdom)
            {
                fftw_wisdom.append(line + "\n");
            }
            if(load_fwisdom)
            {
                fftwf_wisdom.append(line + "\n");
            }
            // the closing line is appended above before the flags are cleared
            if(line.rfind(")", 0) == 0)
            {
                load_wisdom  = false;
                load_fwisdom = false;
            }
        }
        fftw_import_wisdom_from_string(fftw_wisdom.c_str());
        fftwf_import_wisdom_from_string(fftwf_wisdom.c_str());
    }

    if(!test_token.empty())
    {
        std::cout << "Reading fft params from token:\n" << test_token << std::endl;

        try
        {
            manual_params.from_token(test_token);
        }
        catch(...)
{ std::cout << "Unable to parse token." << std::endl; return 1; } } if(!precompile_file.empty()) precompile_test_kernels(precompile_file); auto retval = RUN_ALL_TESTS(); if(use_fftw_wisdom) { std::string fftw_wisdom = std::string(fftw_export_wisdom_to_string()); std::string fftwf_wisdom = std::string(fftwf_export_wisdom_to_string()); fftw_wisdom.append(std::string(fftwf_export_wisdom_to_string())); std::ofstream fftw_wisdom_file(fftw_wisdom_filename); fftw_wisdom_file << fftw_wisdom; fftw_wisdom_file << fftwf_wisdom; fftw_wisdom_file.close(); } std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << std::endl; std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << std::endl; std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << std::endl; std::cout << "single precision max l2 epsilon: " << max_l2_eps_single << std::endl; std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << std::endl; std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << std::endl; // rocfft_cleanup(); return retval; } TEST(manual, vs_fftw) { // Run an individual test using the provided command-line parameters. std::cout << "Manual test:" << std::endl; manual_params.validate(); std::cout << "Token: " << manual_params.token() << std::endl; hipfft_params params(manual_params); try { fft_vs_reference(params, false); } catch(HOSTBUF_MEM_USAGE& e) { // explicitly clear test cache last_cpu_fft_data = last_cpu_fft_cache(); GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } } hipFFT-rocm-6.4.3/clients/tests/hipfft_accuracy_test.cpp000066400000000000000000000566501501537340400233200ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include #include #include #include #include "hipfft/hipfft.h" #include "../hipfft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/gpubuf.h" #include "../../shared/rocfft_against_fftw.h" #include "../../shared/rocfft_complex.h" #include "../../shared/subprocess.h" extern std::string mp_launch; extern last_cpu_fft_cache last_cpu_fft_data; void fft_vs_reference(hipfft_params& params, bool round_trip) { switch(params.precision) { case fft_precision_half: fft_vs_reference_impl(params, round_trip); break; case fft_precision_single: fft_vs_reference_impl(params, round_trip); break; case fft_precision_double: fft_vs_reference_impl(params, round_trip); break; } } // Test for comparison between FFTW and hipFFT. 
TEST_P(accuracy_test, vs_fftw)
{
    hipfft_params params(GetParam());

    params.validate();

    // skip (rather than fail) parameter sets that do not pass valid()
    if(!params.valid(verbose))
    {
        if(verbose)
        {
            std::cout << "Invalid parameters, skip this test." << std::endl;
        }
        GTEST_SKIP();
    }

    switch(params.mp_lib)
    {
    case fft_params::fft_mp_lib_none:
    {
        // only do round trip for single-GPU FFTs
        bool round_trip = params.multiGPU <= 1;

        // NOTE(review): when params.run_callbacks is set, nothing is executed in this branch
        // and the test falls through to SUCCEED() below -- confirm that is intended.
        if(!params.run_callbacks)
        {
            try
            {
                fft_vs_reference(params, round_trip);
            }
            catch(HOSTBUF_MEM_USAGE& e)
            {
                // explicitly clear cache
                last_cpu_fft_data = last_cpu_fft_cache();
                GTEST_SKIP() << e.msg;
            }
            catch(ROCFFT_SKIP& e)
            {
                GTEST_SKIP() << e.msg;
            }
            catch(ROCFFT_FAIL& e)
            {
                GTEST_FAIL() << e.msg;
            }
        }
        break;
    }
    case fft_params::fft_mp_lib_mpi:
    {
        // split launcher into tokens since the first one is the exe
        // and the remainder is the start of its argv
        // separator arguments: backslash escape, space delimiter, double-quote quoting
        // NOTE(review): template arguments of the boost and std::vector types below are
        // missing in this copy of the file.
        boost::escaped_list_separator sep('\\', ' ', '\"');
        boost::tokenizer> tokenizer(mp_launch, sep);
        std::string              exe;
        std::vector argv;
        for(auto t : tokenizer)
        {
            // empty tokens are dropped
            if(t.empty())
                continue;
            if(exe.empty())
                exe = t;
            else
                argv.push_back(t);
        }
        // append test token and ask for accuracy test
        argv.push_back("--token");
        argv.push_back(params.token());
        argv.push_back("--accuracy");

        // throws an exception if launch fails or if subprocess
        // returns nonzero exit code
        execute_subprocess(exe, argv, {});
        break;
    }
    default:
        GTEST_FAIL() << "Invalid communicator choice!";
        break;
    }
    SUCCEED();
}

#ifdef __HIP__

// load/store callbacks - cbdata in each is cast to a pointer whose pointee exposes `base`
// (the expected buffer address) and `scalar` (the number to apply to each element)
//
// NOTE(review): the template parameter lists and static_cast target types in this section
// are missing in this copy of the file -- restore from upstream.

// Load callback: returns input[offset] * scalar when `input` equals the recorded base
// address, otherwise returns input[0]. sharedMem is not used.
template
__host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // multiply each element by scalar
    if(input == testdata->base)
        return input[offset] * testdata->scalar;
    else
    {
        // wrong base address passed, return something obviously wrong
        return input[0];
    }
}

// Device-side function pointers to the load callback, one per element type; their values
// are copied to the host by get_load_callback_host().
__device__ auto load_callback_dev_half = load_callback;
__device__ auto load_callback_dev_complex_half = load_callback>;

__device__ auto load_callback_dev_float = load_callback;
__device__ auto load_callback_dev_complex_float = load_callback>;

__device__ auto load_callback_dev_double = load_callback;
__device__ auto load_callback_dev_complex_double = load_callback>;

// Returns the load-callback pointer matching (itype, precision), read from the device
// symbol with hipMemcpyFromSymbol; returns nullptr for array types not handled below.
// round_trip_inverse is not used in the body.
void* get_load_callback_host(fft_array_type itype,
                             fft_precision  precision,
                             bool           round_trip_inverse = false)
{
    void* load_callback_host = nullptr;
    switch(itype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(precision)
        {
        case fft_precision_half:
            EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                          HIP_SYMBOL(load_callback_dev_complex_half),
                                          sizeof(void*)),
                      hipSuccess);
            return load_callback_host;
        case fft_precision_single:
            EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                          HIP_SYMBOL(load_callback_dev_complex_float),
                                          sizeof(void*)),
                      hipSuccess);
            return load_callback_host;
        case fft_precision_double:
            EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host,
                                          HIP_SYMBOL(load_callback_dev_complex_double),
                                          sizeof(void*)),
                      hipSuccess);
            return load_callback_host;
        }
        // NOTE(review): no break here -- a precision matching none of the inner cases falls
        // through into the real case below.
    }
    case fft_array_type_real:
    {
        switch(precision)
        {
        case fft_precision_half:
            EXPECT_EQ(hipMemcpyFromSymbol(
                          &load_callback_host, HIP_SYMBOL(load_callback_dev_half), sizeof(void*)),
                      hipSuccess);
            return load_callback_host;
        case fft_precision_single:
            EXPECT_EQ(hipMemcpyFromSymbol(
                          &load_callback_host, HIP_SYMBOL(load_callback_dev_float), sizeof(void*)),
                      hipSuccess);
            return load_callback_host;
        case fft_precision_double:
            EXPECT_EQ(hipMemcpyFromSymbol(
                          &load_callback_host, HIP_SYMBOL(load_callback_dev_double), sizeof(void*)),
                      hipSuccess);
            return load_callback_host;
        }
    }
    default:
        // planar is unsupported for now
        return load_callback_host;
    }
}

// Store callback: writes element + scalar to output[offset] when `output` equals the
// recorded base address; otherwise writes nothing. sharedMem is not used.
template
__host__ __device__ static void
    store_callback(Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // add scalar to each element
    if(output == testdata->base)
    {
        output[offset] = element + testdata->scalar;
    }
    // otherwise, wrong base address passed, just don't write
}

// Device-side function pointers to the store callback, one per element type; their values
// are copied to the host by get_store_callback_host().
__device__ auto store_callback_dev_half = store_callback;
__device__ auto store_callback_dev_complex_half = store_callback>;

__device__ auto store_callback_dev_float = store_callback;
__device__ auto store_callback_dev_complex_float = store_callback>;

__device__ auto store_callback_dev_double = store_callback;
__device__ auto store_callback_dev_complex_double = store_callback>;

// Returns the store-callback pointer matching (otype, precision), read from the device
// symbol with hipMemcpyFromSymbol; returns nullptr for array types not handled below.
// round_trip_inverse is not used in the body.
void* get_store_callback_host(fft_array_type otype,
                              fft_precision  precision,
                              bool           round_trip_inverse = false)
{
    void* store_callback_host = nullptr;
    switch(otype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(precision)
        {
        case fft_precision_half:
            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                          HIP_SYMBOL(store_callback_dev_complex_half),
                                          sizeof(void*)),
                      hipSuccess);
            return store_callback_host;
        case fft_precision_single:
            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                          HIP_SYMBOL(store_callback_dev_complex_float),
                                          sizeof(void*)),
                      hipSuccess);
            return store_callback_host;
        case fft_precision_double:
            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                          HIP_SYMBOL(store_callback_dev_complex_double),
                                          sizeof(void*)),
                      hipSuccess);
            return store_callback_host;
        }
        // NOTE(review): no break here -- a precision matching none of the inner cases falls
        // through into the real case below.
    }
    case fft_array_type_real:
    {
        switch(precision)
        {
        case fft_precision_half:
            EXPECT_EQ(hipMemcpyFromSymbol(
                          &store_callback_host, HIP_SYMBOL(store_callback_dev_half), sizeof(void*)),
                      hipSuccess);
            return store_callback_host;
        case fft_precision_single:
            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                          HIP_SYMBOL(store_callback_dev_float),
                                          sizeof(void*)),
                      hipSuccess);
            return store_callback_host;
        case fft_precision_double:
            EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host,
                                          HIP_SYMBOL(store_callback_dev_double),
                                          sizeof(void*)),
                      hipSuccess);
            return store_callback_host;
        }
    }
    default:
        // planar is unsupported for now
        return store_callback_host;
    }
}

// implement result scaling as a store callback, as rocFFT
tests do void apply_store_callback(const fft_params& params, std::vector& output) { if(!params.run_callbacks && params.scale_factor == 1.0) return; callback_test_data cbdata; cbdata.scalar = params.store_cb_scalar; cbdata.base = output.front().data(); switch(params.otype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } } } break; case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: { // planar wouldn't run callbacks, but we could still want scaling switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(std::complex); for(auto& buf : output) { const size_t 
num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } case fft_precision_single: { const size_t elem_size = sizeof(std::complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } case fft_precision_double: { const size_t elem_size = sizeof(std::complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } } } break; case fft_array_type_real: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_fp16); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); const size_t num_elems = 
output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } } } break; default: // this is FFTW data which should always be interleaved (if complex) abort(); } } // apply load callback if necessary void apply_load_callback(const fft_params& params, std::vector& input) { if(!params.run_callbacks) return; // we're applying callbacks to FFTW input/output which we can // assume is contiguous and non-planar callback_test_data cbdata; cbdata.scalar = params.load_cb_scalar; cbdata.base = input.front().data(); switch(params.itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } } } break; case fft_array_type_real: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_fp16); 
const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } } } break; default: // this is FFTW data which should always be interleaved (if complex) abort(); } } #else // Stubs for callback tests. // Many seem to be called unconditionally, so we can't throw exceptions in // most cases. 
void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse = false) { return nullptr; } void apply_load_callback(const fft_params& params, std::vector& input) {} // implement result scaling as a store callback, as rocFFT tests do void apply_store_callback(const fft_params& params, std::vector& output) { if(params.scale_factor == 1.0) return; switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_fp16); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; element = static_cast(element) * params.scale_factor; } } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; element = element * params.scale_factor; } } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; element = element * params.scale_factor; } } break; } } } void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse = false) { throw std::runtime_error("get_store_callback_host not implemented"); return nullptr; } #endif hipFFT-rocm-6.4.3/clients/tests/hipfft_accuracy_test.h000066400000000000000000000025431501537340400227550ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef ROCFFT_ACCURACY_TEST #define ROCFFT_ACCURACY_TEST #include "../../shared/accuracy_test.h" #include "../hipfft_params.h" void fft_vs_reference(hipfft_params& params, bool round_trip = false); #endif hipFFT-rocm-6.4.3/clients/tests/hipfft_mpi_worker.cpp000066400000000000000000000031271501537340400226340ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "../../shared/mpi_worker.h" #include "../hipfft_params.h" int main(int argc, char* argv[]) { return mpi_worker_main, false>( "hipFFT MPI worker process", argc, argv, [](const std::vector& lib_strings) { return std::array(); }); } hipFFT-rocm-6.4.3/clients/tests/hipfft_test_params.h000066400000000000000000000024501501537340400224430ustar00rootroot00000000000000// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef TESTCONSTANTS_H #define TESTCONSTANTS_H #include "hipfft/hipfft.h" #include extern int verbose; extern size_t ramgb; #endif hipFFT-rocm-6.4.3/clients/tests/multi_device_test.cpp000066400000000000000000000230321501537340400226230ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" #include "../hipfft_params.h" #include #include #include #include extern fft_params::fft_mp_lib mp_lib; extern int mp_ranks; static const std::vector> multi_gpu_sizes = { {128, 256}, {64, 128, 256}, }; enum SplitType { // split both input and output on slow FFT dimension SLOW_INOUT, // split only input on slow FFT dimension, output is not split SLOW_IN, // split only output on slow FFT dimension, input is not split SLOW_OUT, // split input on slow FFT dimension, and output on fast FFT dimension SLOW_IN_FAST_OUT, // 3D pencil decomposition - one dimension is contiguous on input // and another dimension contiguous on output, remaining dims are // both split PENCIL_3D, }; std::vector param_generator_multi_gpu(const std::optional type) { int localDeviceCount = 0; (void)hipGetDeviceCount(&localDeviceCount); // if we have an explicit split of data on the user side, we need // to use the multiprocessing API if(type) { if(mp_lib == fft_params::fft_mp_lib_none) return {}; } // data is not explicitly split up, that means the library is // asked to do the split. We need multiple GPUs to do this. 
else if(localDeviceCount < 2) return {}; static const std::vector> stride_range = {{1}}; auto params_complex = param_generator_complex(test_prob, multi_gpu_sizes, precision_range_sp_dp, {1, 10}, stride_generator(stride_range), stride_generator(stride_range), {{0, 0}}, {{0, 0}}, {fft_placement_inplace, fft_placement_notinplace}, false); auto params_real = param_generator_real(test_prob, multi_gpu_sizes, precision_range_sp_dp, {1, 10}, stride_generator(stride_range), stride_generator(stride_range), {{0, 0}}, {{0, 0}}, {fft_placement_notinplace}, false); std::vector all_params; auto distribute_params = [=, &all_params](const std::vector& params) { for(auto& p : params) { // test library splitting if(!type) { auto param_multi = p; // for single-batch, cuFFT only allows in-place if(p.nbatch == 1 && p.placement == fft_placement_notinplace) continue; param_multi.multiGPU = std::min(static_cast(p.nbatch), localDeviceCount); all_params.emplace_back(std::move(param_multi)); } else { // the API only allows for batch-1 multi-process FFTs if(p.nbatch > 1) continue; // user-specified split int brickCount = mp_ranks; // start with all-ones in grids std::vector input_grid(p.length.size() + 1, 1); std::vector output_grid(p.length.size() + 1, 1); auto p_dist = p; switch(*type) { case SLOW_INOUT: input_grid[1] = brickCount; output_grid[1] = brickCount; break; case SLOW_IN: // this type only specifies input field and no output // field, but multi-process transforms require both // fields. if(mp_lib != fft_params::fft_mp_lib_none) continue; input_grid[1] = brickCount; break; case SLOW_OUT: // this type only specifies output field and no input // field, but multi-process transforms require both // fields. 
if(mp_lib != fft_params::fft_mp_lib_none) continue; output_grid[1] = brickCount; break; case SLOW_IN_FAST_OUT: // requires at least rank-2 FFT if(p.length.size() < 2) continue; input_grid[1] = brickCount; output_grid.back() = brickCount; break; case PENCIL_3D: // need at least 2 bricks per split dimension, or 4 devices. // also needs to be a 3D problem. if(brickCount < 4 || p.length.size() != 3) continue; // make fast dimension contiguous on input input_grid[1] = static_cast(sqrt(brickCount)); input_grid[2] = brickCount / input_grid[1]; // make middle dimension contiguous on output output_grid[1] = input_grid[1]; output_grid[3] = input_grid[2]; break; } p_dist.mp_lib = mp_lib; p_dist.distribute_input(localDeviceCount, input_grid); p_dist.distribute_output(localDeviceCount, output_grid); // "placement" flag is meaningless if exactly one of // input+output is a field. So just add those cases if // the flag is "out-of-place", since "in-place" is // exactly the same test case. if(p_dist.placement == fft_placement_inplace && p_dist.ifields.empty() != p_dist.ofields.empty()) continue; // in-place transforms require identical input/output layouts if(p.placement == fft_placement_inplace && input_grid != output_grid) continue; all_params.push_back(std::move(p_dist)); } } }; distribute_params(params_complex); distribute_params(params_real); return all_params; } // split both input and output on slowest FFT dim INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_dim, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT)), accuracy_test::TestName); // split slowest FFT dim only on input, or only on output INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_input_dim, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_output_dim, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT)), accuracy_test::TestName); // split input on slowest FFT and output on fastest, to 
minimize data // movement (only makes sense for rank-2 and higher FFTs) INSTANTIATE_TEST_SUITE_P(multi_gpu_slowin_fastout, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT)), accuracy_test::TestName); // 3D pencil decompositions INSTANTIATE_TEST_SUITE_P(multi_gpu_3d_pencils, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D)), accuracy_test::TestName); // library-decided splits INSTANTIATE_TEST_SUITE_P(multi_gpu, accuracy_test, ::testing::ValuesIn(param_generator_multi_gpu({})), accuracy_test::TestName); hipFFT-rocm-6.4.3/clients/tests/simple_test.cpp000066400000000000000000000606411501537340400214520ustar00rootroot00000000000000// Copyright (c) 2018 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "hipfft/hipfft.h" #include #include #include #include #include "../hipfft_params.h" DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP // Function to return maximum error for float and double types. template inline double type_epsilon(); template <> inline double type_epsilon() { return 1e-6; } template <> inline double type_epsilon() { return 1e-7; } TEST(hipfftTest, Create1dPlan) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t length = 1024; ASSERT_EQ(hipfftPlan1d(&plan, length, HIPFFT_C2C, 1), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CreatePlanMany) { int const rank = 3; int const nX = 64; int const nY = 128; int const nZ = 23; int n[3] = {nX, nY, nZ}; int inembed[3] = {nX, nY, nZ}; int* inembed_null = nullptr; int const istride = 1; int const idist = nX * nY * nZ; int onembed[3] = {nX, nY, nZ}; int* onembed_null = nullptr; int const ostride = 1; int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; int const batch = 1000; size_t workSize; // Tests plan creation with null and not null // combinations of inembed and onembed. 
// // Valid combinations: // inembed == null && onembed == null // or // inembed != null && onembed != null // // otherwise HIPFFT_INVALID_VALUE should be // returned to maintain compatibility with cuFFT // inembed == null && onembed == null { hipfftHandle plan_valid_1 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_valid_1), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany(plan_valid_1, rank, (int*)n, inembed_null, istride, idist, onembed_null, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS) << "inembed == null && onembed == null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftSetAutoAllocation(plan_valid_1, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_valid_1), HIPFFT_SUCCESS); } // inembed != null && onembed != null { hipfftHandle plan_valid_2 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_valid_2), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany(plan_valid_2, rank, (int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS) << "inembed != null && onembed != null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftSetAutoAllocation(plan_valid_2, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_valid_2), HIPFFT_SUCCESS); } // inembed != null && onembed == null { hipfftHandle plan_invalid_1 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_1), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany(plan_invalid_1, rank, (int*)n, inembed, istride, idist, onembed_null, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_VALUE) << "inembed != null && onembed == null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftDestroy(plan_invalid_1), HIPFFT_SUCCESS); } // inembed == null && onembed != null { hipfftHandle plan_invalid_2 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_2), HIPFFT_SUCCESS); auto ret_hipfft = 
hipfftMakePlanMany(plan_invalid_2, rank, (int*)n, inembed_null, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_VALUE) << "inembed == null && onembed != null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftDestroy(plan_invalid_2), HIPFFT_SUCCESS); } } TEST(hipfftTest, CreatePlanMany64) { int const rank = 3; long long int const nX = 64; long long int const nY = 128; long long int const nZ = 23; long long int n[3] = {nX, nY, nZ}; long long int inembed[3] = {nX, nY, nZ}; long long int const istride = 1; long long int const idist = nX * nY * nZ; long long int onembed[3] = {nX, nY, nZ}; long long int onembed_invalid[3] = {nX, nY, -nZ}; long long int const ostride = 1; long long int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; long long int const batch = 1000; long long int const batch_invalid = -2; size_t workSize; // Tests the 64-bit version of plan creation // with valid/invalid data layouts. // First test with a valid data layout { hipfftHandle plan_valid = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_valid), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_valid, rank, (long long int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS); ASSERT_EQ(hipfftSetAutoAllocation(plan_valid, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_valid), HIPFFT_SUCCESS); } // invalid data layout (n array has a negative entry). 
only test rocFFT // backend, since it's more strict #ifdef __HIP_PLATFORM_AMD__ long long int n_invalid[3] = {nX, -nY, nZ}; { hipfftHandle plan_invalid_1 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_1), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_invalid_1, rank, (long long int*)n_invalid, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_VALUE); ASSERT_EQ(hipfftSetAutoAllocation(plan_invalid_1, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_invalid_1), HIPFFT_SUCCESS); } #endif // invalid data layout (onembed array has a negative entry) { hipfftHandle plan_invalid_2 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_2), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_invalid_2, rank, (long long int*)n, inembed, istride, idist, onembed_invalid, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_SIZE); ASSERT_EQ(hipfftSetAutoAllocation(plan_invalid_2, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_invalid_2), HIPFFT_SUCCESS); } // invalid data layout (batch is negative) { hipfftHandle plan_invalid_3 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_3), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_invalid_3, rank, (long long int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch_invalid, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_SIZE); ASSERT_EQ(hipfftSetAutoAllocation(plan_invalid_3, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_invalid_3), HIPFFT_SUCCESS); } } TEST(hipfftTest, hipfftGetSizeMany) { int const rank = 3; int const nX = 33; int const nY = 128; int const nZ = 100; int n[3] = {nX, nY, nZ}; int inembed[3] = {nX, nY, nZ}; int const istride = 1; int const idist = nX * nY * nZ; int onembed[3] = {nX, nY, nZ}; int const ostride = 1; int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; int const batch = 1; 
size_t workSize; hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); auto ret_hipfft = hipfftGetSizeMany(plan, rank, (int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS); ASSERT_EQ(hipfftSetAutoAllocation(plan, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, hipfftGetSizeMany64) { int const rank = 3; long long int const nX = 133; long long int const nY = 354; long long int const nZ = 256; long long int n[3] = {nX, nY, nZ}; long long int inembed[3] = {nX, nY, nZ}; long long int const istride = 1; long long int const idist = nX * nY * nZ; long long int onembed[3] = {nX, nY, nZ}; long long int const ostride = 1; long long int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; long long int const batch = 2; size_t workSize; hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); auto ret_hipfft = hipfftGetSizeMany64(plan, rank, (long long int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS); ASSERT_EQ(hipfftSetAutoAllocation(plan, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeC2C) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 1024; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_C2C, 1, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // No extra work buffer for C2C EXPECT_EQ(workSize, 0); #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeR2C) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); // real forward transform cannot modify input, so we need to pick // a sufficiently small N such that rocFFT can fuse // post-processing into one kernel and 
avoid a temp buffer size_t n = 256; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_R2C, 1, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(float)); } #endif EXPECT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeC2R) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 2048; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_C2R, 1, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(float)); } #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeD2Z) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); // real forward transform cannot modify input, so we need to pick // a sufficiently small N such that rocFFT can fuse // post-processing into one kernel and avoid a temp buffer size_t n = 256; size_t batch = 1000; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_D2Z, batch, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(double)); } #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeZ2D) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 2048; size_t batch = 1000; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, 
HIPFFT_Z2D, batch, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(double)); } #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } #ifdef __HIP_PLATFORM_AMD__ TEST(hipfftTest, CheckNullWorkBuffer) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 2048; size_t batch = 1000; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_Z2D, batch, &workSize), HIPFFT_SUCCESS); EXPECT_EQ(hipfftSetWorkArea(plan, nullptr), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } #endif TEST(hipfftTest, RunR2C) { const size_t N = 4096; float in[N]; for(size_t i = 0; i < N; i++) in[i] = i + (i % 3) - (i % 7); hipfftReal* d_in; hipfftComplex* d_out; ASSERT_EQ(hipMalloc(&d_in, N * sizeof(hipfftReal)), hipSuccess); ASSERT_EQ(hipMalloc(&d_out, (N / 2 + 1) * sizeof(hipfftComplex)), hipSuccess); ASSERT_EQ(hipMemcpy(d_in, in, N * sizeof(hipfftReal), hipMemcpyHostToDevice), hipSuccess); hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t workSize; ASSERT_EQ(hipfftMakePlan1d(plan, N, HIPFFT_R2C, 1, &workSize), HIPFFT_SUCCESS); EXPECT_EQ(hipfftExecR2C(plan, d_in, d_out), HIPFFT_SUCCESS); std::vector out(N / 2 + 1); ASSERT_EQ(hipMemcpy(&out[0], d_out, (N / 2 + 1) * sizeof(hipfftComplex), hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); ASSERT_EQ(hipFree(d_in), hipSuccess); ASSERT_EQ(hipFree(d_out), hipSuccess); ; // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(N % 2 != 0) { EXPECT_TRUE(workSize != 0); } double ref_in[N]; for(size_t i = 0; i < N; i++) ref_in[i] = in[i]; fftw_complex* ref_out; fftw_plan ref_p; ref_out = 
(fftw_complex*)fftw_malloc(sizeof(fftw_complex) * (N / 2 + 1)); ref_p = fftw_plan_dft_r2c_1d(N, ref_in, ref_out, FFTW_ESTIMATE); fftw_execute(ref_p); double maxv = 0; double nrmse = 0; // normalized root mean square error for(size_t i = 0; i < (N / 2 + 1); i++) { // printf("element %d: FFTW result %f, %f; hipFFT result %f, %f \n", (int)i, ref_out[i][0], ref_out[i][1], out[i].x, out[i].y); double dr = ref_out[i][0] - out[i].x; double di = ref_out[i][1] - out[i].y; maxv = fabs(ref_out[i][0]) > maxv ? fabs(ref_out[i][0]) : maxv; maxv = fabs(ref_out[i][1]) > maxv ? fabs(ref_out[i][1]) : maxv; nrmse += ((dr * dr) + (di * di)); } nrmse /= (double)((N / 2 + 1)); nrmse = sqrt(nrmse); nrmse /= maxv; EXPECT_TRUE(nrmse < type_epsilon()); fftw_destroy_plan(ref_p); fftw_free(ref_out); } // ask for a transform whose parameters are only valid out-of-place. // since hipFFT generates both in-place and out-place plans up front // (because it's not told about the placement until exec time), this // ensures that a failure to create an in-place plan doesn't prevent // the out-place plan from working. 
TEST(hipfftTest, OutplaceOnly) { static const int N_in_const = 4; static const int N_out_const = N_in_const / 2 + 1; // mutable sizes for passing to hipFFT int N_in = N_in_const; int N_out = N_out_const; float in[N_in_const]; for(int i = 0; i < N_in; i++) in[i] = i + (i % 3) - (i % 7); hipfftReal* d_in; hipfftComplex* d_out; ASSERT_EQ(hipMalloc(&d_in, N_in * sizeof(hipfftReal)), hipSuccess); ASSERT_EQ(hipMalloc(&d_out, N_out * sizeof(hipfftComplex)), hipSuccess); ASSERT_EQ(hipMemcpy(d_in, in, N_in * sizeof(hipfftReal), hipMemcpyHostToDevice), hipSuccess); hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); ASSERT_EQ(hipfftPlanMany(&plan, 1, &N_in, &N_in, 1, N_in, &N_out, 1, N_out, HIPFFT_R2C, 1), HIPFFT_SUCCESS); ASSERT_EQ(plan == hipfft_params::INVALID_PLAN_HANDLE, false); ASSERT_EQ(hipfftExecR2C(plan, d_in, d_out), HIPFFT_SUCCESS) << "hipfftExecR2C failed"; std::vector out(N_out); ASSERT_EQ(hipMemcpy(out.data(), d_out, N_out * sizeof(hipfftComplex), hipMemcpyDeviceToHost), hipSuccess); // in-place transform isn't really *supposed* to work - this // might or might not fail but we can at least check that it // doesn't blow up. 
//hipfftExecR2C(plan, reinterpret_cast(d_out), d_out); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); ASSERT_EQ(hipFree(d_in), hipSuccess); ASSERT_EQ(hipFree(d_out), hipSuccess); double ref_in[N_in_const]; for(int i = 0; i < N_in_const; i++) ref_in[i] = in[i]; fftw_complex* ref_out; fftw_plan ref_p; ref_out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N_out); ref_p = fftw_plan_dft_r2c_1d(N_in, ref_in, ref_out, FFTW_ESTIMATE); fftw_execute(ref_p); double maxv = 0; double nrmse = 0; // normalized root mean square error for(int i = 0; i < N_out; i++) { // printf("element %d: FFTW result %f, %f; hipFFT result %f, %f \n", (int)i, ref_out[i][0], ref_out[i][1], out[i].x, out[i].y); double dr = ref_out[i][0] - out[i].x; double di = ref_out[i][1] - out[i].y; maxv = fabs(ref_out[i][0]) > maxv ? fabs(ref_out[i][0]) : maxv; maxv = fabs(ref_out[i][1]) > maxv ? fabs(ref_out[i][1]) : maxv; nrmse += ((dr * dr) + (di * di)); } nrmse /= (double)(N_out); nrmse = sqrt(nrmse); nrmse /= maxv; ASSERT_TRUE(nrmse < type_epsilon()); fftw_destroy_plan(ref_p); fftw_free(ref_out); } hipFFT-rocm-6.4.3/cmake/000077500000000000000000000000001501537340400146645ustar00rootroot00000000000000hipFFT-rocm-6.4.3/cmake/dependencies.cmake000066400000000000000000000101351501537340400203140ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# # #############################################################################
# Locates the packages this build depends on: HIP, the FFT backend
# (rocfft, or NVHPC/CUDAToolkit when BUILD_WITH_LIB is "CUDA"), optional MPI,
# and the rocm-cmake helper modules (downloaded when not found locally).

# HIP
# The lookup differs depending on whether the C++ compiler path ends in hipcc.
if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
  if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
    if( WIN32 )
      find_package( HIP CONFIG REQUIRED )
    else()
      find_package( HIP REQUIRED )
    endif()
    list( APPEND HIP_INCLUDE_DIRS "${HIP_ROOT_DIR}/include" )
  endif()
else()
  if( BUILD_WITH_LIB STREQUAL "CUDA" )
    # No find_package here; only the include directory is derived from HIP_ROOT_DIR
    set(HIP_INCLUDE_DIRS "${HIP_ROOT_DIR}/include")
  else()
    if( WIN32 )
      find_package( HIP CONFIG REQUIRED )
    else()
      find_package( HIP REQUIRED )
    endif()
  endif()
endif()

# Either rocfft or cufft is required
if(NOT BUILD_WITH_LIB STREQUAL "CUDA")
  if( HIPFFT_MPI_ENABLE )
    find_package( MPI REQUIRED )
  endif()
  find_package(rocfft REQUIRED)
else()
  # cufft may be in the HPC SDK or ordinary CUDA
  if( HIPFFT_MPI_ENABLE )
    if( NOT BUILD_SHARED_LIBS )
      message( FATAL_ERROR "cufftMp is shared-only, static build is not possible" )
    endif()
    # MPI support is only in HPC SDK
    find_package(NVHPC REQUIRED COMPONENTS CUDA MATH MPI)
  else()
    # Optional in the non-MPI case: QUIET and not REQUIRED
    find_package(NVHPC QUIET COMPONENTS CUDA MATH)
  endif()
  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
  find_package(CUDAToolkit REQUIRED)
endif()

# ROCm
# Look for rocm-cmake under /opt/rocm first; if that fails, download the
# archive for ${rocm_cmake_tag}, install it under the build tree and retry.
find_package( ROCM 0.7.3 CONFIG PATHS /opt/rocm )
if(NOT ROCM_FOUND)
  set( rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download" )
  set( PROJECT_EXTERN_DIR "${CMAKE_CURRENT_BINARY_DIR}/extern" )
  file( DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log)
  # STATUS is a two-element list: numeric code, then a message string
  list(GET status 0 status_code)
  list(GET status 1 status_string)
  if(NOT status_code EQUAL 0)
    # A failed download is only a warning; ROCM_FOUND stays false and is reported below
    message(WARNING "error: downloading 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed status_code: ${status_code} status_string: ${status_string} log: ${log} ")
  else()
    message(STATUS "downloading... done")
    # Unpack, configure with an install prefix inside the extern dir, then install
    execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzvf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip WORKING_DIRECTORY ${PROJECT_EXTERN_DIR} )
    execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake . WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} )
    execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
    find_package( ROCM 0.7.3 CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
  endif()
endif()

if( ROCM_FOUND )
  message(STATUS "Found ROCm")
  # Helper modules provided by the rocm-cmake package
  include(ROCMSetupVersion)
  include(ROCMCreatePackage)
  include(ROCMInstallTargets)
  include(ROCMPackageConfigHelpers)
  include(ROCMInstallSymlinks)
  include(ROCMCheckTargetIds)
  include(ROCMClients)
  include(ROCMHeaderWrapper)
else()
  message(WARNING "Could not find rocm-cmake, packaging will fail.")
endif( )
hipFFT-rocm-6.4.3/cmake/get-cli-arguments.cmake000066400000000000000000000044221501537340400212170ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # # #############################################################################

# Attempt (best effort) to return a list of user specified parameters cmake was invoked with
# NOTE: Even if the user specifies CMAKE_INSTALL_PREFIX on the command line, the parameter is
# not returned because it does not have the matching helpstring
#
# Arguments:
#   initial_cli_args - NAME of a list variable whose contents are placed first in the result
#   return_cli_args  - NAME of the variable, set in the caller's scope, that receives
#                      the initial arguments followed by one "-D<name>=<value>" entry per
#                      cache variable that was specified on the command line
function( append_cmake_cli_arguments initial_cli_args return_cli_args )
  # Start from an empty accumulator. Variables visible at the call site are also
  # visible inside the function, so without this a caller-side `cli_args` would
  # silently be prepended to the detected arguments.
  set( cli_args "" )

  # Retrieves the contents of CMakeCache.txt
  get_cmake_property( cmake_properties CACHE_VARIABLES )

  foreach( property ${cmake_properties} )
    get_property(help_string CACHE ${property} PROPERTY HELPSTRING )

    # Properties specified on the command line have boilerplate text
    if( help_string MATCHES "variable specified on the command line" )
      # message( STATUS "property: ${property}")
      # message( STATUS "value: ${${property}}")

      list( APPEND cli_args "-D${property}=${${property}}")
    endif( )
  endforeach( )

  # message( STATUS "get_command_line_arguments: ${cli_args}")
  set( ${return_cli_args} ${${initial_cli_args}} ${cli_args} PARENT_SCOPE )
endfunction( )
hipFFT-rocm-6.4.3/cmake/package-functions.cmake000066400000000000000000000043271501537340400212750ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# # #############################################################################

# ########################################################################
# A helper function to generate packaging scripts to register libraries with system
#
# Writes two bash maintainer scripts into scripts_write_dir:
#   postinst - on "configure", writes the library install directory into
#              /etc/ld.so.conf.d/<library_name>-dev.conf and runs ldconfig
#   prerm    - on "remove" or "purge", deletes that file and runs ldconfig
#
# Arguments:
#   scripts_write_dir - directory that receives the postinst and prerm files
#   library_name      - used to build the ld.so.conf.d file name
#   library_link_name - not referenced by this function; kept so existing
#                       callers do not need to change
#
# CPACK_PACKAGING_INSTALL_PREFIX and LIB_INSTALL_DIR are expanded when this
# function runs, so the generated scripts contain literal paths.
# ########################################################################
function( write_rocm_package_script_files scripts_write_dir library_name library_link_name )

  set( ld_conf_file "/etc/ld.so.conf.d/${library_name}-dev.conf" )

  # Each script statement needs its own line: the shebang must stand alone on
  # line 1 and the function body must be terminated before its closing brace.
  file( WRITE ${scripts_write_dir}/postinst
"#!/bin/bash

set -e

do_ldconfig() {
  echo ${CPACK_PACKAGING_INSTALL_PREFIX}/${LIB_INSTALL_DIR} > ${ld_conf_file} && ldconfig
}

case \"\$1\" in
  configure)
    do_ldconfig
  ;;
  abort-upgrade|abort-remove|abort-deconfigure)
    echo \"\$1\"
  ;;
  *)
    exit 0
  ;;
esac
" )

  file( WRITE ${scripts_write_dir}/prerm
"#!/bin/bash

set -e

rm_ldconfig() {
  rm -f ${ld_conf_file} && ldconfig
}

case \"\$1\" in
  remove|purge)
    rm_ldconfig
  ;;
  *)
    exit 0
  ;;
esac
" )

endfunction( )
hipFFT-rocm-6.4.3/cmake/verbose.cmake000066400000000000000000000062641501537340400173430ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software.
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # # #############################################################################

# Prints the main build settings at configure time. Purely informational:
# no variable is modified. Each label names the variable whose value follows.

# Project and install locations
message(STATUS "hipfft_VERSION : ${hipfft_VERSION}")
message(STATUS "\t==>CMAKE_BUILD_TYPE : ${CMAKE_BUILD_TYPE}")
message(STATUS "\t==>BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}")
message(STATUS "\t==>CMAKE_INSTALL_PREFIX link : ${CMAKE_INSTALL_PREFIX}")
message(STATUS "\t==>CMAKE_MODULE_PATH link : ${CMAKE_MODULE_PATH}")
message(STATUS "\t==>CMAKE_PREFIX_PATH link : ${CMAKE_PREFIX_PATH}")

message(STATUS "==============")

# Platform, compiler and flags
message(STATUS "\t==>CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}")
message(STATUS "\t==>HIP_ROOT_DIR : ${HIP_ROOT_DIR}")
message(STATUS "\t==>CMAKE_CXX_COMPILER : ${CMAKE_CXX_COMPILER}")
message(STATUS "\t==>CMAKE_CXX_COMPILER_VERSION : ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "\t==>CMAKE_CXX_FLAGS : ${CMAKE_CXX_FLAGS}")
message(STATUS "\t==>CMAKE_CXX_FLAGS_DEBUG : ${CMAKE_CXX_FLAGS_DEBUG}")
message(STATUS "\t==>CMAKE_CXX_FLAGS_RELEASE : ${CMAKE_CXX_FLAGS_RELEASE}")
message(STATUS "\t==>CMAKE_CXX_FLAGS_RELWITHDEBINFO : ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS : ${CMAKE_EXE_LINKER_FLAGS}")
message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS_RELEASE : ${CMAKE_EXE_LINKER_FLAGS_RELEASE}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS : ${CMAKE_SHARED_LINKER_FLAGS}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE : ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}")

message(STATUS "==============" )

# Shared-library specific flags
message(STATUS "\t==>CMAKE_SHARED_LIBRARY_C_FLAGS : ${CMAKE_SHARED_LIBRARY_C_FLAGS}")
message(STATUS "\t==>CMAKE_SHARED_LIBRARY_CXX_FLAGS : ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS : ${CMAKE_SHARED_LINKER_FLAGS}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_DEBUG : ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE : ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}")
hipFFT-rocm-6.4.3/deps/000077500000000000000000000000001501537340400145375ustar00rootroot00000000000000hipFFT-rocm-6.4.3/deps/CMakeLists.txt000066400000000000000000000102741501537340400173030ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.
# #############################################################################

# Helper cmake script to automate building dependencies for hipfft
# This script can be invoked manually by the user with 'cmake -P'
#
# Super-build: nothing is compiled here directly. Each enabled dependency is
# added as an external project, and a custom `install` target runs the
# dependency's own install step from inside its build directory.

# The ROCm platform requires Ubuntu 16.04 or Fedora 24, which has cmake 3.5
cmake_minimum_required( VERSION 3.5 )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../cmake )

# Consider removing this in the future
# It can be annoying for visual studio developers to build a project that tries to install into 'program files'
if( WIN32 AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# The superbuild does not build anything itself; all compiling is done in external projects
project( hipfft-dependencies NONE )

option( BUILD_BOOST "Download and build boost library" ON )
option( BUILD_GTEST "Download and build googletest library" ON )
# option( BUILD_VERBOSE "Print helpful build debug information" OFF )

# if( BUILD_VERBOSE )
#   message( STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}" )
#   message( STATUS "CMAKE_BINARY_DIR: ${CMAKE_BINARY_DIR}" )
#   message( STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}" )
#   message( STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}" )
#   message( STATUS "CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}" )
#   message( STATUS "CMAKE_CURRENT_LIST_DIR: ${CMAKE_CURRENT_LIST_DIR}" )
#   message( STATUS "CMAKE_CURRENT_LIST_FILE: ${CMAKE_CURRENT_LIST_FILE}" )
# endif( )

# This module scrapes the CMakeCache.txt file and attempts to get all the cli options the user specified to cmake invocation
include( get-cli-arguments )

# The following is a series of super-build projects; this cmake project will download and build
#
# In the *_custom_target commands below, $<SEMICOLON> is required. It puts a
# literal ';' into the generated command line so that `cd <dir>` and the
# install command are two shell statements. A bare ';' would be consumed as a
# CMake list separator, and omitting it hands the install command to `cd` as
# extra arguments.
if( BUILD_GTEST )
  include( external-gtest )

  list( APPEND hipfft_dependencies googletest )
  set( gtest_custom_target COMMAND cd ${GTEST_BINARY_ROOT}$<SEMICOLON> ${CMAKE_COMMAND} --build . --target install )
endif( )

if( BUILD_BOOST )
  include( external-boost )

  list( APPEND hipfft_dependencies boost )
  set( boost_custom_target COMMAND cd ${BOOST_BINARY_ROOT}$<SEMICOLON> ${Boost.Command} install )
endif( )

# POLICY CMP0037 - "Target names should not be reserved and should match a validity pattern"
# Familiar target names like 'install' should be OK at the super-build level
if( POLICY CMP0037 )
  cmake_policy( SET CMP0037 OLD )
endif( )

add_custom_target( install
  ${boost_custom_target}
  ${gtest_custom_target}
  DEPENDS ${hipfft_dependencies}
)
hipFFT-rocm-6.4.3/deps/external-boost.cmake000066400000000000000000000170471501537340400205180ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.
# #############################################################################

# Configures Boost as an ExternalProject: picks the archive type and hash for
# the platform, assembles the b2 command line in Boost.Command, and exports
# BOOST_INSTALL_ROOT / BOOST_BINARY_ROOT for the including script.

message( STATUS "Configuring boost external dependency" )
include( ExternalProject )

set( PREFIX_BOOST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where boost should install, defaults to /usr/local" )

# We need to detect the compiler the user is attempting to invoke with CMake,
# we do our best to translate cmake parameters into bjam parameters
enable_language( CXX )
include( build-bitness )

# TODO: Options should be added to allow downloading Boost straight from github

# This file is used to add Boost as a library dependency to another project
# This sets up boost to download from sourceforge, and builds it as a cmake
# ExternalProject

# Change this one line to upgrade to newer versions of boost
set( ext.Boost_VERSION "1.64.0" CACHE STRING "Boost version to download/use" )
mark_as_advanced( ext.Boost_VERSION )
string( REPLACE "." "_" ext.Boost_Version_Underscore ${ext.Boost_VERSION} )

message( STATUS "ext.Boost_VERSION: " ${ext.Boost_VERSION} )

if( WIN32 )
  # For newer cmake versions, 7z archives are much smaller to download
  if( CMAKE_VERSION VERSION_LESS "3.1.0" )
    set( Boost_Ext "zip" )
  else( )
    set( Boost_Ext "7z" )
  endif( )
else( )
  set( Boost_Ext "tar.bz2" )
endif( )

if( WIN32 )
  set( Boost.Command b2 --prefix=${PREFIX_BOOST} )
else( )
  set( Boost.Command ./b2 --prefix=${PREFIX_BOOST} )
endif( )

# NOTE(review): each whitespace-separated word below becomes its own b2
# argument, so only the first flag is attached to `cxxflags=`. Confirm
# whether the remaining flags were meant to be part of cxxflags.
if( CMAKE_COMPILER_IS_GNUCXX )
  list( APPEND Boost.Command cxxflags=-fPIC -std=c++11 )
elseif( XCODE_VERSION OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang") )
  list( APPEND Boost.Command cxxflags=-std=c++11 -stdlib=libc++ linkflags=-stdlib=libc++ )
endif( )

include( ProcessorCount )
ProcessorCount( Cores )
if( NOT Cores EQUAL 0 )
  # Travis can fail to build Boost sporadically; uses 32 cores, reduce stress on VM
  if( DEFINED ENV{TRAVIS} )
    if( Cores GREATER 8 )
      set( Cores 8 )
    endif( )
  endif( )

  # Add build thread in addition to the number of cores that we have
  math( EXPR Cores "${Cores} + 1 " )
else( )
  # If we could not detect # of cores, assume 1 core and add an additional build thread
  set( Cores "2" )
endif( )

message( STATUS "ExternalBoost using ( " ${Cores} " ) cores to build with" )
message( STATUS "ExternalBoost building [ serialization, filesystem, system, regex ] components" )

list( APPEND Boost.Command -j ${Cores} --with-serialization --with-filesystem --with-system --with-regex )

if( BUILD_64 )
  list( APPEND Boost.Command address-model=64 )
else( )
  list( APPEND Boost.Command address-model=32 )
endif( )

if( MSVC10 )
  list( APPEND Boost.Command toolset=msvc-10.0 )
elseif( MSVC11 )
  list( APPEND Boost.Command toolset=msvc-11.0 )
elseif( MSVC12 )
  list( APPEND Boost.Command toolset=msvc-12.0 )
elseif( MSVC14 )
  list( APPEND Boost.Command toolset=msvc-14.0 )
elseif( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) )
  list( APPEND Boost.Command toolset=clang )
elseif( CMAKE_COMPILER_IS_GNUCXX )
  list( APPEND Boost.Command toolset=gcc )
endif( )

if( WIN32 AND (ext.Boost_VERSION VERSION_LESS "1.60.0") )
  list( APPEND Boost.Command define=BOOST_LOG_USE_WINNT6_API )
endif( )

if( NOT DEFINED ext.Boost_LINK )
  # Quoted so the condition stays well-formed when BUILD_SHARED_LIBS is unset;
  # unquoted, an empty expansion leaves `if( MATCHES "ON" )`.
  if( "${BUILD_SHARED_LIBS}" MATCHES "ON" )
    set( ext.Boost_LINK "shared" CACHE STRING "Which boost link method? static | shared | static,shared" )
  else( )
    set( ext.Boost_LINK "static" CACHE STRING "Which boost link method? static | shared | static,shared" )
  endif( )
endif()
mark_as_advanced( ext.Boost_LINK )

if( WIN32 )
  # Versioned is the default on windows
  set( ext.Boost_LAYOUT "versioned" CACHE STRING "Which boost layout method? versioned | tagged | system" )

  # For windows, default to build both variants to support the VS IDE
  set( ext.Boost_VARIANT "debug,release" CACHE STRING "Which boost variant? debug | release | debug,release" )
else( )
  # Tagged builds provide unique enough names to be able to build both variants
  set( ext.Boost_LAYOUT "tagged" CACHE STRING "Which boost layout method? versioned | tagged | system" )

  # For Linux, typically a build tree only needs one variant
  # Quoted for the same reason as above: CMAKE_BUILD_TYPE may be empty.
  if( "${CMAKE_BUILD_TYPE}" MATCHES "Debug")
    set( ext.Boost_VARIANT "debug" CACHE STRING "Which boost variant? debug | release | debug,release" )
  else( )
    set( ext.Boost_VARIANT "release" CACHE STRING "Which boost variant? debug | release | debug,release" )
  endif( )
endif( )
mark_as_advanced( ext.Boost_LAYOUT )
mark_as_advanced( ext.Boost_VARIANT )

list( APPEND Boost.Command --layout=${ext.Boost_LAYOUT} link=${ext.Boost_LINK} variant=${ext.Boost_VARIANT} )

message( STATUS "Boost.Command: ${Boost.Command}" )

# If the user has a cached local copy stored somewhere, they can define the full path to the package in a BOOST_URL environment variable
if( DEFINED ENV{BOOST_URL} )
  set( ext.Boost_URL "$ENV{BOOST_URL}" CACHE STRING "URL to download Boost from" )
else( )
  set( ext.Boost_URL "http://sourceforge.net/projects/boost/files/boost/${ext.Boost_VERSION}/boost_${ext.Boost_Version_Underscore}.${Boost_Ext}/download" CACHE STRING "URL to download Boost from" )
endif( )
mark_as_advanced( ext.Boost_URL )

# Bootstrap command and expected SHA256 of the archive chosen above
set( Boost.Bootstrap "" )
set( ext.HASH "" )
if( WIN32 )
  set( Boost.Bootstrap "bootstrap.bat" )

  if( CMAKE_VERSION VERSION_LESS "3.1.0" )
    # .zip file
    set( ext.HASH "b99973c805f38b549dbeaf88701c0abeff8b0e8eaa4066df47cac10a32097523" )
  else( )
    # .7z file
    set( ext.HASH "49c6abfeb5b480f6a86119c0d57235966b4690ee6ff9e6401ee868244808d155" )
  endif( )
else( )
  set( Boost.Bootstrap "./bootstrap.sh" )

  # .tar.bz2
  set( ext.HASH "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332" )

  if( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) )
    list( APPEND Boost.Bootstrap --with-toolset=clang )
  endif( )
endif( )

# Below is a fancy CMake command to download, build and install Boost on the users computer
ExternalProject_Add(
  boost
  PREFIX ${CMAKE_BINARY_DIR}/boost
  URL ${ext.Boost_URL}
  URL_HASH SHA256=${ext.HASH}
  UPDATE_COMMAND ${Boost.Bootstrap}
  LOG_UPDATE 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ${Boost.Command} stage
  BUILD_IN_SOURCE 1
  LOG_BUILD 1
  INSTALL_COMMAND ""
)

set_property( TARGET boost PROPERTY FOLDER "extern" )
ExternalProject_Get_Property( boost install_dir )
ExternalProject_Get_Property( boost binary_dir )

# For use by the user of external-boost.cmake
set( BOOST_INSTALL_ROOT ${install_dir} )
set( BOOST_BINARY_ROOT ${binary_dir} )
hipFFT-rocm-6.4.3/deps/external-gtest.cmake000066400000000000000000000115061501537340400205120ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.
# #############################################################################

# Configures googletest as an ExternalProject: forwards the user's cmake
# command-line options, chooses a build command for the generator in use,
# and exports GTEST_INSTALL_ROOT / GTEST_BINARY_ROOT for the including script.

message( STATUS "Configuring gtest external dependency" )
include( ExternalProject )

# set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=/package )
set( PREFIX_GTEST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where gtest should install, defaults to /usr/local" )
set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=${PREFIX_GTEST} )

# Appends the -D options the user passed to this cmake invocation
append_cmake_cli_arguments( gtest_cmake_args gtest_cmake_args )

set( gtest_git_repository "https://github.com/google/googletest.git" CACHE STRING "URL to download gtest from" )
set( gtest_git_tag "release-1.8.0" CACHE STRING "Git tag of gtest to check out" )

if( MSVC )
  list( APPEND gtest_cmake_args -Dgtest_force_shared_crt=ON -DCMAKE_DEBUG_POSTFIX=d )
# else( )
  # GTEST_USE_OWN_TR1_TUPLE necessary to compile with hipcc
#  list( APPEND gtest_cmake_args -DGTEST_USE_OWN_TR1_TUPLE=1 )
endif( )

if( CMAKE_CONFIGURATION_TYPES )
  # Multi-config generator: build both configurations. <BINARY_DIR> is the
  # ExternalProject placeholder for this project's build directory; without
  # it `cmake --build` has no directory argument.
  set( gtest_make
    COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config Release
    COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config Debug
  )
else( )
  # Add build thread in addition to the number of cores that we have
  include( ProcessorCount )
  ProcessorCount( Cores )

  # If we are not using an IDE, assume nmake with visual studio
  if( MSVC )
    set( gtest_make "nmake" )
  else( )
    set( gtest_make "make" )

    # The -j paramter does not work with nmake
    if( NOT Cores EQUAL 0 )
      math( EXPR Cores "${Cores} + 1 " )
      list( APPEND gtest_make -j ${Cores} )
    else( )
      # If we could not detect # of cores, assume 1 core and add an additional build thread
      list( APPEND gtest_make -j 2 )
    endif( )
  endif( )

  message( STATUS "ExternalGmock using ( " ${Cores} " ) cores to build with" )
endif( )

# message( STATUS "gtest_make ( " ${gtest_make} " ) " )
# message( STATUS "gtest_cmake_args ( " ${gtest_cmake_args} " ) " )

# Master branch has a new structure that combines googletest with googlemock
ExternalProject_Add(
  googletest
  PREFIX ${CMAKE_BINARY_DIR}/gtest
  GIT_REPOSITORY ${gtest_git_repository}
  GIT_TAG ${gtest_git_tag}
  CMAKE_ARGS ${gtest_cmake_args}
  BUILD_COMMAND ${gtest_make}
  LOG_BUILD 1
  INSTALL_COMMAND ""
  LOG_INSTALL 1
)

ExternalProject_Get_Property( googletest source_dir )

# For visual studio, the path 'debug' is hardcoded because that is the default VS configuration for a build.
# Doesn't matter if its the gtest or gtestd project above
set( package_dir "${PREFIX_GTEST}" )
if( CMAKE_CONFIGURATION_TYPES )
  # Create a package by bundling libraries and header files
  if( BUILD_64 )
    set( LIB_DIR lib64 )
  else( )
    set( LIB_DIR lib )
  endif( )

  # <BINARY_DIR> and <SOURCE_DIR> are ExternalProject placeholders for this
  # project's directories; without them the copies would read from paths
  # rooted at "/". Each source directory is copied exactly once.
  set( gtest_lib_dir "<BINARY_DIR>/${LIB_DIR}" )
  ExternalProject_Add_Step( googletest createPackage
    COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Debug ${package_dir}/${LIB_DIR}
    COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Release ${package_dir}/${LIB_DIR}
    COMMAND ${CMAKE_COMMAND} -E copy_directory <SOURCE_DIR>/include ${package_dir}/include
    COMMAND ${CMAKE_COMMAND} -E copy_directory <SOURCE_DIR>/gtest/include/gtest ${package_dir}/include/gtest
    DEPENDEES install
  )
endif( )

set_property( TARGET googletest PROPERTY FOLDER "extern")
ExternalProject_Get_Property( googletest install_dir )
ExternalProject_Get_Property( googletest binary_dir )

# For use by the user of external-gtest.cmake
set( GTEST_INSTALL_ROOT ${install_dir} )
set( GTEST_BINARY_ROOT ${binary_dir} )
hipFFT-rocm-6.4.3/docs/000077500000000000000000000000001501537340400145345ustar00rootroot00000000000000hipFFT-rocm-6.4.3/docs/.gitignore000066400000000000000000000000621501537340400165220ustar00rootroot00000000000000_doxygen/ doxygen/html/ doxygen/rtf/ doxygen/xml/ hipFFT-rocm-6.4.3/docs/conceptual/000077500000000000000000000000001501537340400166715ustar00rootroot00000000000000hipFFT-rocm-6.4.3/docs/conceptual/overview.rst000066400000000000000000000533761501537340400213070ustar00rootroot00000000000000..
meta:: :description: hipFFT documentation and API reference library :keywords: FFT, hipFFT, rocFFT, ROCm, API, documentation .. _hipfft-overview: ******************************************************************** hipFFT overview ******************************************************************** hipFFT is a GPU FFT marshalling library that supports either :doc:`rocFFT <rocfft:index>` or NVIDIA CUDA `cuFFT`_ as the backend. hipFFT exports an interface that does not require the client to change, regardless of the chosen backend. It sits between the application and the backend FFT library, marshalling inputs into the backend and results back to the application. ===================== Basic hipFFT usage ===================== To use hipFFT, follow this step-by-step process: #. Create a transform plan for the FFT. To create a plan, use the functions :cpp:func:`hipfftPlan1d`, :cpp:func:`hipfftPlan2d`, or :cpp:func:`hipfftPlan3d`, depending on the dimensions of the FFT. For a 1D FFT, use the following code: .. code-block:: cpp hipfftHandle plan; hipfftPlan1d(&plan, N, HIPFFT_C2C, 1); For higher-dimension plans, use :cpp:func:`hipfftPlan2d` or :cpp:func:`hipfftPlan3d`. #. Allocate a work buffer (optional) hipFFT generally handles memory allocation internally, so work buffers aren't explicitly required. However, to manually manage memory, you can still allocate buffers before execution. You might want to do this, for example, if you have multiple plans that need work buffers and you want them to share a single buffer. Otherwise, each plan will allocate its own work memory, which might be wasteful. #. Execute the plan To execute the FFT computation, use :cpp:func:`hipfftExecC2C`, :cpp:func:`hipfftExecR2C`, or :cpp:func:`hipfftExecC2R`, depending on the type of transform. You can reuse the same plan for multiple executions, changing the data pointers as necessary. .. code-block:: cpp hipfftExecC2C(plan, x, x, HIPFFT_FORWARD); #.
Destroy the plan

   After you are done with the plan, destroy it to free the associated resources:

   .. code-block:: cpp

      hipfftDestroy(plan);

#. Free any device memory (if applicable)

   If you allocated any buffers for storing input/output data or intermediate results, free them using ``hipFree``:

   .. code-block:: cpp

      hipFree(x);

#. Terminate the library

   No specific cleanup function is required for hipFFT, but ensure that any HIP memory is freed and
   the HIP runtime is cleaned up appropriately after all computations are done.

The following code sample illustrates how to apply these steps:

.. code-block:: cpp

   #include <iostream>
   #include <vector>
   #include "hip/hip_runtime_api.h"
   #include "hip/hip_vector_types.h"
   #include "hipfft/hipfft.h"

   int main()
   {
       hipfftHandle plan;
       size_t N = 16;
       size_t Nbytes = N * sizeof(hipfftComplex);

       // Create HIP device buffer
       hipfftComplex *x;
       hipMalloc(&x, Nbytes);

       // Initialize data
       std::vector<hipfftComplex> cx(N);
       for (size_t i = 0; i < N; i++)
       {
           cx[i].x = 1;
           cx[i].y = -1;
       }

       // Copy data to device
       hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice);

       // Create hipFFT plan
       hipfftPlan1d(&plan, N, HIPFFT_C2C, 1);

       // Execute plan
       hipfftExecC2C(plan, x, x, HIPFFT_FORWARD);

       // Wait for execution to finish
       hipDeviceSynchronize();

       // Copy result back to host
       std::vector<hipfftComplex> y(N);
       hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost);

       // Print results
       for (size_t i = 0; i < N; i++)
       {
           std::cout << y[i].x << ", " << y[i].y << std::endl;
       }

       // Free device buffer
       hipFree(x);

       // Destroy plan
       hipfftDestroy(plan);

       return 0;
   }

========================
Advanced hipFFT usage
========================

For transforms that require advanced input layouts, use the :cpp:func:`hipfftPlanMany` function,
setting these parameters:

* ``int rank``: The number of dimensions for the FFT (1D, 2D, or 3D).
* ``int* n``: Array specifying the size of the FFT in each dimension.
* ``int* inembed``: The dimensions of the input data layout in memory.
* ``int istride``: Stride between elements in the input array.
* ``int idist``: Distance between consecutive FFTs in the input array.
* ``int* onembed``: The dimensions of the output data layout in memory.
* ``int ostride``: Stride between elements in the output array.
* ``int odist``: Distance between consecutive FFTs in the output array.
* ``hipfftType type``: Type of FFT (for example, ``HIPFFT_C2C`` or ``HIPFFT_R2C``).
* ``int batch``: Number of FFTs to compute in parallel.

Here's an example of a 2D single-precision real-to-complex transform using the hipFFT advanced interface:

.. code-block:: cpp

   #include <algorithm>
   #include <complex>
   #include <iostream>
   #include <stdexcept>
   #include <vector>
   #include <hip/hip_runtime_api.h>
   #include <hipfft/hipfft.h>

   int main()
   {
       // Define the parameters for the 2D FFT
       int rank    = 2;      // Rank of the transform (2D FFT)
       int n[2]    = {4, 5}; // Dimensions of the FFT (4 rows, 5 columns)
       int howmany = 3;      // Number of transforms to compute (batch size)

       // Derived parameters for handling real-to-complex output
       int n1_complex_elements      = n[1] / 2 + 1;            // Number of complex elements in the last dimension
       int n1_padding_real_elements = n1_complex_elements * 2; // Adjusted real elements to account for padding

       // Strides and distances
       int istride    = 1;       // Stride between elements in input
       int ostride    = istride; // Stride between elements in output
       int inembed[2] = {istride * n[0], istride * n1_padding_real_elements}; // Input layout
       int onembed[2] = {ostride * n[0], ostride * n1_complex_elements};      // Output layout
       int idist      = inembed[0] * inembed[1]; // Distance between batches in input
       int odist      = onembed[0] * onembed[1]; // Distance between batches in output

       // Print the layout parameters
       std::cout << "n: " << n[0] << " " << n[1] << "\n"
                 << "howmany: " << howmany << "\n"
                 << "istride: " << istride << "\tostride: " << ostride << "\n"
                 << "inembed: " << inembed[0] << " " << inembed[1] << "\n"
                 << "onembed: " << onembed[0] << " " << onembed[1] << "\n"
                 << "idist: " << idist << "\todist: " << odist << "\n"
                 << std::endl;

       // Initialize input data
       std::vector<float> data(howmany * idist); // Allocate space for batched input
       const auto
total_bytes = data.size() * sizeof(decltype(data)::value_type); std::cout << "input:\n"; std::fill(data.begin(), data.end(), 0.0); // Fill data with zeros for(int ibatch = 0; ibatch < howmany; ++ibatch) { for(int i = 0; i < n[0]; i++) // Loop over rows { for(int j = 0; j < n[1]; j++) // Loop over columns { // Calculate the position in the input array const auto pos = ibatch * idist + istride * (i * inembed[1] + j); data[pos] = i + ibatch + j; // Populate data with unique values for clarity } } } // Print the input data for each batch for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < inembed[0]; i++) { for(int j = 0; j < inembed[1]; j++) { const auto pos = ibatch * idist + i * inembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; // Create the hipFFT plan for batched 2D real-to-complex transforms hipfftHandle hipForwardPlan; hipfftResult hipfft_rt = hipfftPlanMany(&hipForwardPlan, rank, n, inembed, istride, idist, onembed, ostride, odist, HIPFFT_R2C, // Transform type (real-to-complex) howmany); // Number of transforms in the batch if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); // Allocate GPU memory for input and output hipfftReal* gpu_data; hipError_t hip_rt = hipMalloc((void**)&gpu_data, total_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Copy input data to the GPU hip_rt = hipMemcpy(gpu_data, (void*)data.data(), total_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Execute the FFT on the GPU hipfft_rt = hipfftExecR2C(hipForwardPlan, gpu_data, (hipfftComplex*)gpu_data); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to execute plan"); // Copy the output data back to the host hip_rt = hipMemcpy((void*)data.data(), gpu_data, total_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw 
std::runtime_error("hipMemcpy failed");

       // Display the output data
       std::cout << "output:\n";
       const std::complex<float>* output = (std::complex<float>*)data.data();
       for(int ibatch = 0; ibatch < howmany; ++ibatch)
       {
           std::cout << "batch: " << ibatch << "\n";
           for(int i = 0; i < onembed[0]; i++)
           {
               for(int j = 0; j < onembed[1]; j++)
               {
                   const auto pos = ibatch * odist + i * onembed[1] + j;
                   std::cout << output[pos] << " ";
               }
               std::cout << "\n";
           }
           std::cout << "\n";
       }
       std::cout << std::endl;

       // Clean up resources
       hipfftDestroy(hipForwardPlan); // Destroy the FFT plan
       hip_rt = hipFree(gpu_data);    // Free the GPU memory
       if(hip_rt != hipSuccess)
           throw std::runtime_error("hipFree failed");

       return 0;
   }

======================
Overlapping input data
======================

There are signal processing tasks, such as sliding window FFTs, where overlapping data can improve computational efficiency.
Care must be taken to ensure proper memory management and alignment when using overlapping input layouts.

The following example demonstrates the use of overlapping input data by configuring the ``inembed``, ``istride``,
and ``idist`` parameters in the :cpp:func:`hipfftPlanMany` function.
Set these parameters to create a memory layout where portions of the input data are reused across multiple FFT batches:

* ``inembed`` specifies the physical layout of the input data in memory,
  with extra padding to accommodate overlapping rows (for example, ``2240``).
* ``istride`` ensures continuous reading of data within each row (if set to ``1``).
* ``idist`` defines the distance between the starting points of consecutive batches (for example, ``432``),
  which is smaller than the total memory implied by ``xformSz`` and ``inembed``.

..
code-block:: cpp

   #include <complex>
   #include <cstdlib>
   #include <iostream>
   #include <vector>
   #include <hip/hip_runtime_api.h>
   #include <hip/hip_vector_types.h>
   #include <hipfft/hipfft.h>

   int main()
   {
       std::cout << "hipFFT 2D batched complex-to-complex transform example\n";

       // FFT configuration
       int rank       = 2;
       int xformSz[2] = {512, 512};       // 2D FFT size: 512x512
       int inEmbed[2] = {1, 2240};        // Input data layout
       int onEmbed[2] = {1, 512};         // Output data layout
       int istride = 1, ostride = 1;      // Stride for input and output
       int idist = 432, odist = 262144;   // Batch distance for input and output
       int batch = 5;                     // Number of FFTs to compute in parallel

       // Calculate input and output sizes in bytes.
       // The input batches overlap (idist is smaller than one transform's footprint), so the
       // input buffer must extend to the last element read by the last batch rather than
       // idist * batch elements.
       size_t inElems = static_cast<size_t>(batch - 1) * idist
                        + static_cast<size_t>(xformSz[0] - 1) * inEmbed[1]
                        + static_cast<size_t>(xformSz[1] - 1) * istride + 1;
       size_t inSize  = inElems * sizeof(std::complex<float>);
       size_t outSize = odist * batch * sizeof(std::complex<float>);

       // Initialize HIP and hipFFT resources
       hipSetDevice(0);
       hipStream_t stream;
       hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);

       hipfftHandle handleF;
       if (hipfftPlanMany(&handleF, rank, xformSz, inEmbed, istride, idist,
                          onEmbed, ostride, odist, HIPFFT_C2C, batch) != HIPFFT_SUCCESS)
       {
           std::cerr << "Failed to create hipFFT plan" << std::endl;
           return EXIT_FAILURE;
       }
       hipfftSetStream(handleF, stream);

       // Allocate device memory
       std::complex<float>* miTD; // Input buffer
       std::complex<float>* miFD; // Output buffer
       if (hipMalloc(&miTD, inSize) != hipSuccess || hipMalloc(&miFD, outSize) != hipSuccess)
       {
           std::cerr << "hipMalloc failed" << std::endl;
           return EXIT_FAILURE;
       }

       // Initialize input data on the host
       std::vector<std::complex<float>> inputData(inElems, {0.0f, 0.0f});
       for (int ibatch = 0; ibatch < batch; ++ibatch)
       {
           for (int i = 0; i < xformSz[0]; ++i)
           {
               for (int j = 0; j < xformSz[1]; ++j)
               {
                   int pos = ibatch * idist + i * inEmbed[1] + j;
                   inputData[pos] = std::complex<float>(i + j, ibatch);
               }
           }
       }

       // Copy input data to device
       if (hipMemcpy(miTD, inputData.data(), inSize, hipMemcpyHostToDevice) != hipSuccess)
       {
           std::cerr << "hipMemcpy failed" << std::endl;
           return EXIT_FAILURE;
       }

       // Execute FFT
       if (hipfftExecC2C(handleF,
                         reinterpret_cast<hipfftComplex*>(miTD),
                         reinterpret_cast<hipfftComplex*>(miFD),
                         HIPFFT_FORWARD) != HIPFFT_SUCCESS)
       {
           std::cerr << "Failed to execute hipFFT" << std::endl;
           return EXIT_FAILURE;
       }

       // Synchronize stream
       hipStreamSynchronize(stream);

       // Copy results back to host
       std::vector<std::complex<float>> outputData(odist * batch);
       if (hipMemcpy(outputData.data(), miFD, outSize, hipMemcpyDeviceToHost) != hipSuccess)
       {
           std::cerr << "hipMemcpy failed" << std::endl;
           return EXIT_FAILURE;
       }

       // Display results
       std::cout << "Output data:\n";
       for (int ibatch = 0; ibatch < batch; ++ibatch)
       {
           std::cout << "Batch " << ibatch << ":\n";
           for (int i = 0; i < xformSz[0]; ++i)
           {
               for (int j = 0; j < xformSz[1] / 2 + 1; ++j)
               {
                   int pos = ibatch * odist + i * onEmbed[1] + j;
                   std::cout << outputData[pos] << " ";
               }
               std::cout << "\n";
           }
           std::cout << "\n";
       }

       // Clean up resources
       hipfftDestroy(handleF);
       hipStreamDestroy(stream);
       hipFree(miTD);
       hipFree(miFD);

       return EXIT_SUCCESS;
   }

=================
Multi-GPU example
=================

The following example demonstrates a multi-GPU 2D double-precision complex-to-complex transform using the hipFFT library.
It showcases how to perform a 2D Fast Fourier Transform (FFT) in double precision (complex-to-complex) across two GPUs.

The following concepts and API calls are used:

* ``hipfftXt``: This API lets users execute FFTs across multiple GPUs by managing multi-GPU plans.
  ``hipfftXt`` provides an extended version of the hipFFT API to handle GPU-specific operations,
  such as memory allocation and execution across multiple devices.
  For more details, see the :doc:`API reference <../reference/fft-api-usage>`.
* :cpp:func:`hipfftCreate`: Creates a hipFFT plan that contains the FFT configuration.
  This plan is used to configure the FFT transform operation.
* ``hipStreamCreate``: Creates a stream for managing GPU work concurrently.
  This enables execution of the multi-GPU plan in parallel on multiple GPUs.
  For more details, see :doc:`HIP <hip:index>`.
* :cpp:func:`hipfftXtSetGPUs`: Assigns the GPUs (in this case, two GPUs) to the hipFFT plan,
  enabling multi-GPU computation for the FFT.
* :cpp:func:`hipfftMakePlan2d`: Creates a 2D FFT plan for the specified input/output size (``Nx``, ``Ny``),
  specifying the transform type (complex-to-complex in this case).
* :cpp:func:`hipfftXtMalloc`: Allocates memory on the GPUs for storing the FFT input and output data.
* :cpp:func:`hipfftXtMemcpy`: Copies data between the host and GPU memory,
  supporting both host-to-device and device-to-host operations.
* :cpp:func:`hipfftXtExecDescriptor`: Executes the FFT operation based on the input descriptor ``desc``,
  which holds the input data and transform configuration.
* :cpp:func:`hipfftXtFree`: Frees the memory allocated for the input/output descriptors after the computation is completed.

For detailed API usage, see :ref:`hipfft-api-usage`.

.. code-block:: cpp

   #include <array>
   #include <complex>
   #include <iostream>
   #include <stdexcept>
   #include <vector>
   #include <hip/hip_runtime_api.h>
   #include <hipfft/hipfftXt.h>
   #include "../hipfft_params.h"

   int main()
   {
       // Define FFT dimensions
       const int Nx = 512;
       const int Ny = 512;
       int direction = HIPFFT_FORWARD; // forward = -1, backward = 1

       // Initialize input data (complex numbers) for FFT computation
       int verbose = 0;
       std::vector<std::complex<double>> cinput(Nx * Ny);
       for(size_t i = 0; i < Nx * Ny; i++)
       {
           cinput[i] = i; // Initialize the data with some values
       }

       // Optionally, print the input data
       if(verbose)
       {
           std::cout << "Input:\n";
           for(int i = 0; i < Nx; i++)
           {
               for(int j = 0; j < Ny; j++)
               {
                   int pos = i * Ny + j;
                   std::cout << cinput[pos] << " ";
               }
               std::cout << "\n";
           }
           std::cout << std::endl;
       }

       // Specify the GPUs you want to use for multi-GPU setup
       std::array<int, 2> gpus = {0, 1}; // Use GPU 0 and GPU 1

       // Create a multi-GPU plan
       hipLibXtDesc* desc; // Input descriptor for the Xt format
       hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; // Initialize plan handle

       // Create the FFT plan
       if(hipfftCreate(&plan) != HIPFFT_SUCCESS)
           throw std::runtime_error("failed to create plan");

       // Create a GPU stream and assign it to the plan for asynchronous operations
       hipStream_t stream{};
       if(hipStreamCreate(&stream) != hipSuccess)
           throw std::runtime_error("hipStreamCreate failed.");
       if(hipfftSetStream(plan, stream) != HIPFFT_SUCCESS)
           throw std::runtime_error("hipfftSetStream failed.");

       // Assign GPUs to the plan (this is where multi-GPU is specified)
       hipfftResult hipfft_rt = hipfftXtSetGPUs(plan, gpus.size(), gpus.data());
       if(hipfft_rt != HIPFFT_SUCCESS)
           throw std::runtime_error("hipfftXtSetGPUs failed.");

       // Make the 2D plan for FFT (this defines the 2D FFT using the specified dimensions)
       size_t workSize[gpus.size()];
       hipfft_rt = hipfftMakePlan2d(plan, Nx, Ny, HIPFFT_Z2Z, workSize);
       if(hipfft_rt != HIPFFT_SUCCESS)
           throw std::runtime_error("hipfftMakePlan2d failed.");

       // Allocate memory for input data on the GPUs (Xt format handles the data distribution)
       hipfftXtSubFormat_t format = HIPFFT_XT_FORMAT_INPLACE_SHUFFLED;
       hipfft_rt = hipfftXtMalloc(plan, &desc, format); // Allocate memory for the descriptor
       if(hipfft_rt != HIPFFT_SUCCESS)
           throw std::runtime_error("hipfftXtMalloc failed.");

       // Copy the input data to the GPUs (device memory)
       hipfft_rt = hipfftXtMemcpy(plan,
                                  reinterpret_cast<void*>(desc),
                                  reinterpret_cast<void*>(cinput.data()),
                                  HIPFFT_COPY_HOST_TO_DEVICE); // Copy from host to device
       if(hipfft_rt != HIPFFT_SUCCESS)
           throw std::runtime_error("hipfftXtMemcpy failed.");

       // Execute the FFT computation using the Xt descriptor
       hipfft_rt = hipfftXtExecDescriptor(plan, desc, desc, direction);
       if(hipfft_rt != HIPFFT_SUCCESS)
           throw std::runtime_error("hipfftXtExecDescriptor failed.");

       // Optionally, print the output data (copy the results back to the host)
       if(verbose)
       {
           // Copy the output data back to the host
           hipfft_rt = hipfftXtMemcpy(plan,
                                      reinterpret_cast<void*>(cinput.data()),
                                      reinterpret_cast<void*>(desc),
                                      HIPFFT_COPY_DEVICE_TO_HOST); // Copy from device to host
           if(hipfft_rt != HIPFFT_SUCCESS)
               throw std::runtime_error("hipfftXtMemcpy D2H failed.");

           std::cout << "Output:\n";
           for(size_t i = 0; i < Nx; i++)
           {
               for(size_t j = 0; j < Ny; j++)
               {
                   auto pos = i * Ny + j;
                   std::cout << cinput[pos] << " "; // Print the output FFT
results } std::cout << "\n"; } std::cout << std::endl; } // Clean up memory and resources if(hipfftXtFree(desc) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtFree failed."); if(hipfftDestroy(plan) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftDestroy failed."); if(hipStreamDestroy(stream) != hipSuccess) throw std::runtime_error("hipStreamDestroy failed."); return 0; } .. _rocFFT: https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html .. _cuFFT: https://developer.nvidia.com/cufft hipFFT-rocm-6.4.3/docs/conf.py000066400000000000000000000021401501537340400160300ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html import re from rocm_docs import ROCmDocs with open('../CMakeLists.txt', encoding='utf-8') as f: match = re.search(r'set\( VERSION_STRING \"?([0-9.]+)[^0-9.]+', f.read()) if not match: raise ValueError("VERSION not found!") version_number = match[1] left_nav_title = f"hipFFT {version_number} Documentation" # for PDF output on Read the Docs project = "hipFFT Documentation" author = "Advanced Micro Devices, Inc." copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved." 
version = version_number release = version_number external_toc_path = "./sphinx/_toc.yml" docs_core = ROCmDocs(left_nav_title) docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() external_projects_current_project = "hipfft" for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) hipFFT-rocm-6.4.3/docs/doxygen/000077500000000000000000000000001501537340400162115ustar00rootroot00000000000000hipFFT-rocm-6.4.3/docs/doxygen/Doxyfile000066400000000000000000003221051501537340400177220ustar00rootroot00000000000000# Doxyfile 1.8.10 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. 
This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "hipFFT" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = v1.0.18 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HIP" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. 
# The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. 
If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = ../../library/include/hipfft # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. 
If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. 
INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. 
# The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the later case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. 
Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. 
DISTRIBUTE_GROUP_DOC = YES # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. 
This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = YES # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- SHOW_NAMESPACES = NO # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. 
EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. 
If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. 
SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. 
If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. 
GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. 
Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. 
# The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. 
# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = YES #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../library/include/hipfft # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf, *.as and *.js. 
FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.f90 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf \ *.as \ *.js # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. 
Examples: ANamespace, AClass, # ANamespace::AClass, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match.
The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on, for instance, GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = ../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO.
INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. 
# # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # compiled with the --with-libclang option. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. 
HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML, the header file must include any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses.
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. 
# Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on.
Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries to 1 will produce a fully collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a fully expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). 
If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. 
For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. 
QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). 
For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 1 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. 
# Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes have effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax.
The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use <access key> + S # (what the <access key> is depends on the OS and browser, but it is typically # <CTRL>, <ALT>/