pax_global_header 0000666 0000000 0000000 00000000064 15015373413 0014514 g ustar 00root root 0000000 0000000 52 comment=d790d3ed379830da01564df1762a6d7c94eee965
rocFFT-rocm-6.4.3/ 0000775 0000000 0000000 00000000000 15015373413 0013607 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/.azuredevops/ 0000775 0000000 0000000 00000000000 15015373413 0016234 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/.azuredevops/rocm-ci.yml 0000664 0000000 0000000 00000001240 15015373413 0020305 0 ustar 00root root 0000000 0000000 resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
trigger:
batch: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .githooks
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
pr:
autoCancel: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .githooks
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
drafts: false
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rocFFT.yml@pipelines_repo
rocFFT-rocm-6.4.3/.clang-format 0000664 0000000 0000000 00000006542 15015373413 0016171 0 ustar 00root root 0000000 0000000 # Style file for MLSE Libraries based on the modified rocBLAS style
# Common settings
BasedOnStyle: WebKit
TabWidth: 4
IndentWidth: 4
UseTab: Never
ColumnLimit: 100
# Other languages JavaScript, Proto
---
Language: Cpp
# http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code
# int formatted_code;
# // clang-format off
# void unformatted_code ;
# // clang-format on
# void formatted_code_again;
DisableFormat: false
Standard: Cpp11
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
# Configure each individual brace in BraceWrapping
BreakBeforeBraces: Custom
# Control of individual brace wrapping cases
BraceWrapping: {
AfterCaseLabel: 'true'
AfterClass: 'true'
AfterControlStatement: 'true'
AfterEnum : 'true'
AfterFunction : 'true'
AfterNamespace : 'true'
AfterStruct : 'true'
AfterUnion : 'true'
BeforeCatch : 'true'
BeforeElse : 'true'
IndentBraces : 'false'
# AfterExternBlock : 'true'
}
#BreakAfterJavaFieldAnnotations: true
#BreakBeforeInheritanceComma: false
#BreakBeforeBinaryOperators: None
#BreakBeforeTernaryOperators: true
#BreakConstructorInitializersBeforeComma: true
#BreakStringLiterals: true
CommentPragmas: '^ IWYU pragma:'
#CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
SpaceBeforeCpp11BracedList: false
DerivePointerAlignment: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IndentCaseLabels: false
IndentPPDirectives: None
#FixNamespaceComments: true
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
#JavaScriptQuotes: Double
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
ObjCBlockIndentWidth: 4
#ObjCSpaceAfterProperty: true
#ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: Never
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
#SpaceAfterTemplateKeyword: true
#SpaceBeforeInheritanceColon: true
#SortUsingDeclarations: true
SortIncludes: true
# Comments are for developers, they should arrange them
ReflowComments: false
#IncludeBlocks: Preserve
---
rocFFT-rocm-6.4.3/.githooks/ 0000775 0000000 0000000 00000000000 15015373413 0015514 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/.githooks/install 0000775 0000000 0000000 00000000222 15015373413 0017104 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
# Install the repository's pre-commit hook into the local git hooks directory.
#
# Stop at the first failing command: without this, a failed git lookup or a
# failed cd would let the script carry on, create the symlink in whatever
# directory it happened to be in, and still print "Done!".
set -e

# Resolve the git directory first so a failure of git itself aborts the script
# (a failing command substitution in a plain assignment triggers `set -e`).
git_dir="$(git rev-parse --git-dir)"

# Quoted so a git directory whose path contains spaces stays a single argument.
cd "$git_dir/hooks"

echo "Installing hooks..."
# The link target is resolved relative to the hooks directory itself.
ln -s ../../.githooks/pre-commit pre-commit
echo "Done!"
rocFFT-rocm-6.4.3/.githooks/pre-commit 0000775 0000000 0000000 00000001767 15015373413 0017531 0 ustar 00root root 0000000 0000000 #!/bin/sh
#
# This pre-commit hook checks if any versions of clang-format
# are installed, and if so, uses the installed version to format
# the staged changes.
# Only this clang-format binary is considered by the hook.
base=/opt/rocm/llvm/bin/clang-format
format=""

# Redirect output to stderr.
exec 1>&2

# check if clang-format is installed
type "$base" >/dev/null 2>&1 && format="$base"

# no versions of clang-format are installed
if [ -z "$format" ]
then
    echo "$base is not installed. Pre-commit hook will not be executed."
    exit 0
fi

# Do everything from top - level.
# The path is quoted so a checkout located under a directory containing spaces
# works, and the hook stops if the directory change fails instead of
# formatting files relative to the wrong directory.
cd "$(git rev-parse --show-toplevel)" || exit 1

if git rev-parse --verify HEAD >/dev/null 2>&1
then
    against=HEAD
else
    # Initial commit: diff against an empty tree object
    against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
fi

# do the formatting
# Staged file names are read one per line rather than word-split from a
# command substitution, so paths containing spaces reach clang-format intact.
git diff-index --cached --name-only "$against" \
    | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$' \
    | while IFS= read -r file
do
    # Skip entries that no longer exist in the working tree (e.g. deletions).
    if [ -e "$file" ]
    then
        echo "$format $file"
        "$format" -i -style=file "$file"
    fi
done
rocFFT-rocm-6.4.3/.github/ 0000775 0000000 0000000 00000000000 15015373413 0015147 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/.github/CODEOWNERS 0000775 0000000 0000000 00000000576 15015373413 0016555 0 ustar 00root root 0000000 0000000 * @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts
# Documentation files
docs/ @ROCm/rocm-documentation
*.md @ROCm/rocm-documentation
*.rst @ROCm/rocm-documentation
.readthedocs.yaml @ROCm/rocm-documentation
# Header directory for Doxygen documentation
library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @feizheng10 @malcolmroberts
rocFFT-rocm-6.4.3/.github/CONTRIBUTING.md 0000664 0000000 0000000 00000014633 15015373413 0017407 0 ustar 00root root 0000000 0000000
# Contributing to rocFFT #
We welcome contributions to rocFFT. Please follow these guidelines to help ensure your contributions are accepted.
## Issue Discussion ##
Please use the GitHub Issues tab to notify us of issues.
* Use your best judgment for issue creation. If your issue is already listed, upvote the issue and
comment or post to provide additional details, such as how you reproduced this issue.
* If you're not sure if your issue is the same, err on the side of caution and file your issue.
You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
your issue as being the same as the existing issue, we'll close the duplicate.
* If your issue doesn't exist, use the issue template to file a new issue.
* When filing an issue, be sure to provide as much information as possible, including script output so
we can collect information about your configuration. This helps reduce the time required to
reproduce your issue.
* Check your issue regularly, as we may require additional information to successfully reproduce the
issue.
* You may also open an issue to ask questions to the maintainers about whether a proposed change
meets the acceptance criteria, or to discuss an idea pertaining to the library.
## Acceptance Criteria ##
When a contribution is submitted via a pull request, a number of automated checks are run in order to verify compilation correctness and prevent performance regressions.
These checks include:
* Building and testing the change on various OS platforms (Ubuntu, RHEL, etc.)
* Running on different GPU architectures (MI-series, Radeon series cards, etc.)
* Running benchmarks to check for performance degradation
In order for a submission to be accepted:
* It must pass all of the automated checks
* It must undergo a code review
Users can visualize our continuous integration infrastructure in: `rocFFT/.jenkins`.
The GitHub "Issues" tab may also be used to discuss ideas surrounding particular features or changes before raising pull requests.
## Code Structure ##
In a broad view, rocFFT library is structured as follows:
├── docs/: contains rocFFT documentation
├── library/: contains main source code and headers
├── clients/:
│ ├── bench/ : contains benchmarking code
│ ├── samples/ : contains examples
│ ├── tests/ : contains our test infrastructure
├── shared/: contains important global headers and those for linking to other applications
## Coding Style ##
* All public APIs are C89 compatible; all other library code should use c++17.
* Our minimum supported compiler is clang 3.6.
* Avoid CamelCase: rule applies specifically to publicly visible APIs, but is encouraged (not mandated) for internal code.
* C and C++ code should be formatted using `clang-format`, with the style configuration provided in `rocFFT/.clang-format`.
To format a C/C++ file, use:
```
clang-format -style=file -i <path-to-source-file>
```
* Python code should use:
```
yapf --style pep8
```
## Pull Request Guidelines ##
Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/).
This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code.
Note that a [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user.
The following guidelines apply:
* When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch.
* Note that releases are cut to release/rocm-rel-x.y, where x and y refer to the release major and minor numbers.
* Ensure code builds successfully.
* Do not break existing test cases
* Code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit.
### Deliverables ###
New changes should include test coverage. Our testing infrastructure is located in `clients/tests/`, and can be used as a reference.
The following guidelines apply:
* New functionality will only be merged with new unit tests.
* New unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
* Tests must have good code coverage.
### Process ###
All pull requests must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged.
Once a contribution is ready to be submitted, consider the following:
* Before you create a PR, ensure that all changed files have been run through clang-format: `clang-format -i <changed_file>`
* While creating a PR, you can take a look at a `diff` of the changes you made using the PR's "Files" tab, and verify that no unintentional changes are being submitted.
* Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails.
* During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas.
* When a modification request has been completed, the conversation thread about it will be marked as resolved.
* To update the code in your PR (e.g., in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
* Once your contribution is approved, we will use the *squash merge* option from GitHub to integrate it to the corresponding branch.
## Code License ##
All code contributed to this project will be licensed under the license identified in the [LICENSE.md](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md). Your contribution will be accepted under the same license.
rocFFT-rocm-6.4.3/.github/ISSUE_TEMPLATE.md 0000664 0000000 0000000 00000000461 15015373413 0017655 0 ustar 00root root 0000000 0000000 ### What is the expected behavior
-
### What actually happens
-
### How to reproduce
-
### Environment
| Hardware | description |
|-----|-----|
| GPU | device string |
| CPU | device string |
| Software | version |
|-----|-----|
| ROCK | v0.0 |
| ROCR | v0.0 |
| HCC | v0.0 |
| Library | v0.0 |
rocFFT-rocm-6.4.3/.github/PULL_REQUEST_TEMPLATE.md 0000664 0000000 0000000 00000000070 15015373413 0020745 0 ustar 00root root 0000000 0000000 resolves #___
Summary of proposed changes:
-
-
-
rocFFT-rocm-6.4.3/.github/dependabot.yml 0000664 0000000 0000000 00000001223 15015373413 0017775 0 ustar 00root root 0000000 0000000 # To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/docs/sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
labels:
- "documentation"
- "dependencies"
- "ci:docs-only"
reviewers:
- "samjwu"
rocFFT-rocm-6.4.3/.gitignore 0000664 0000000 0000000 00000000554 15015373413 0015603 0 ustar 00root root 0000000 0000000 # Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
# vim tags
tags
.tags
.*.swp
# Visual Studio Code
.vscode
# install.sh build dir
build/
# python bytecode
__pycache__
rocFFT-rocm-6.4.3/.jenkins/ 0000775 0000000 0000000 00000000000 15015373413 0015326 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/.jenkins/application.groovy 0000664 0000000 0000000 00000024745 15015373413 0021114 0 ustar 00root root 0000000 0000000 #!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
def runCI =
{
nodeDetails, jobName->
def prj = new rocProject('rocFFT-internal', 'application')
prj.defaults.ccache = true
prj.timeout.compile = 600
prj.timeout.test = 600
prj.libraryDependencies = ['rocFFT', 'hipFFT']
// Define test architectures, optional rocm version argument is available
def nodes = new dockerNodes(nodeDetails, jobName, prj)
boolean formatCheck = false
def commonGroovy
// Compile step: optionally fetches the CI-built library dependencies, then
// clones the Gromacs fork (branch develop-2021) and builds and installs it
// twice into the same ../gromacs-install prefix: once in build_tmpi with
// -DGMX_MPI=OFF and once in build_mpi with -DGMX_MPI=ON.
def compileCommand =
{
platform, project->
// Shell snippet that installs each library dependency; stays empty unless
// the project asks for dependencies to be installed from CI.
def getDependenciesCommand = ""
if (project.installLibraryDependenciesFromCI)
{
project.libraryDependencies.each
{ libraryName ->
getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
}
}
// NOTE(review): the build_tmpi configure targets gfx90a while the build_mpi
// configure targets gfx908, and the node lists in this file only name
// '8gfx90a' -- confirm the gfx908 target is intentional.
def command = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}
${getDependenciesCommand}
git clone -b develop-2021 https://github.com/ROCmSoftwarePlatform/Gromacs.git
cd Gromacs
mkdir build_tmpi
cd build_tmpi
cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DGMX_MPI=OFF -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install ..
make
make install
cd ..
mkdir build_mpi
cd build_mpi
cmake -DCMAKE_HIP_ARCHITECTURES=gfx908 -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DGMX_MPI=ON -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install ..
make
make install
cd ..
"""
platform.runCommand(this, command)
}
// Test step: sources the installed GROMACS environment (GMXRC), clones the
// benchmark-gromacs repository and runs `mdrun` on the adh_dodec, stmv and
// cellulose_nve inputs -- first with the `gmx` binary (-ntmpi ranks), then
// with `mpirun ... gmx_mpi` -- using the GPU id sets 0, 01, 0123 and 01234567.
// The script runs under `set -ex`, so the first failing command ends the step.
def testCommand =
{
platform, project->
def command = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}
cd Gromacs
source gromacs-install/bin/GMXRC
gmx --version
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib
echo \$LD_LIBRARY_PATH
git clone https://github.com/jychang48/benchmark-gromacs.git
cd benchmark-gromacs
export GMX_MAXBACKUP=-1
echo "* Threaded MPI ******************************************************************************************************"
#ADH_DODEC
cd adh_dodec
tar zxf adh_dodec.tar.gz
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 150 # 8 GPUs
# STMV
cd ..
cd stmv/
tar zxf stmv.tar.gz
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 200 # 1 GPU
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 400 # 4 GPUs
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 400 # 8 GPUs
# CELLULOSE_NVE
cd ..
cd cellulose_nve/
tar zxf cellulose_nve.tar.gz
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs
gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 200 # 8 GPUs
echo "* MPI ***************************************************************************************************************"
# ADH_DODEC
cd ..
cd adh_dodec/
tar zxf adh_dodec.tar.gz
mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU
mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs
mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs
mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs
# STMV
cd ..
cd stmv/
tar zxf stmv.tar.gz
mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -nstlist 400 -gpu_id 0 -s topol.tpr # 1 GPU
mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs
mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs
mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs
# CELLULOSE_NVE
cd ..
cd cellulose_nve/
tar zxf cellulose_nve.tar.gz
mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU
mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs
mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs
mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs
"""
platform.runCommand(this, command)
}
buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null)
}
ci: {
    // Top-level Jenkins job name, derived from the build URL.
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    // Job-specific triggers, extended with the shared entries.
    def propertyList = auxiliary.appendPropertyList(
        ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 5')])]])

    // Job-specific node selection, extended with the shared entries.
    def jobNameList = auxiliary.appendJobNameList(
        ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['8gfx90a']])])

    // Apply the properties registered for the job that is running.
    propertyList.each { jobName, property ->
        if (urlJobName == jobName) {
            properties(auxiliary.addCommonProperties(property))
        }
    }

    // Run the CI stage for the job that is running.
    jobNameList.each { jobName, nodeDetails ->
        if (urlJobName == jobName) {
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
        }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if (!jobNameList.keySet().contains(urlJobName)) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu18:['8gfx90a']], urlJobName)
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/common.groovy 0000664 0000000 0000000 00000014545 15015373413 0020076 0 ustar 00root root 0000000 0000000 // This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
// Configures, builds and installs rocFFT together with its clients.
//
// platform    - CI platform object; supplies jenkinsLabel and runCommand().
// project     - CI project object; supplies paths and the dependency settings.
// jobName     - accepted for interface compatibility; not used in this function.
// debug       - true builds Debug (with ROCFFT_DEVICE_FORCE_RELEASE=ON) in
//               build/debug, false builds Release in build/release.
// buildStatic - true adds -DBUILD_SHARED_LIBS=off.
// buildMPI    - true adds the OpenMPI prefix path and -DROCFFT_MPI_ENABLE=ON.
def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false, boolean buildMPI=false)
{
    project.paths.construct_build_prefix()

    // Shell snippet that installs each library dependency; stays empty unless
    // the project asks for dependencies to be installed from CI.
    def getDependenciesCommand = ""
    if (project.installLibraryDependenciesFromCI)
    {
        project.libraryDependencies.each { libraryName ->
            getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
        }
    }

    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON'
    String warningArgs = '-DWERROR=ON'
    String buildTunerArgs = '-DROCFFT_BUILD_OFFLINE_TUNER=ON'
    String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
    String buildTypeDir = debug ? 'debug' : 'release'
    String buildMPIArgs = buildMPI ? '-DCMAKE_PREFIX_PATH=/usr/local/openmpi -DROCFFT_MPI_ENABLE=ON' : ''
    String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : ''
    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'

    //Set CI node's gfx arch as target if PR, otherwise use default targets of the library.
    // Safe navigation: when BRANCH_NAME is not set the expression is null
    // (falsy) and the library defaults are used instead of throwing.
    String amdgpuTargets = env.BRANCH_NAME?.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
    String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"

    // `set -e` is only enabled after the dependency snippet, so a failure
    // while fetching dependencies does not abort the script.
    def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}
${getDependenciesCommand}
set -e
mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
${auxiliary.gfxTargetParser()}
${cmake} ${buildMPIArgs} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${warningArgs} ${buildTunerArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../..
make -j\$(nproc)
sudo make install
"""
    platform.runCommand(this, command)
}
// Configures and builds the rocFFT clients on their own, from the clients/
// directory, in clients/build.
//
// platform - CI platform object; supplies jenkinsLabel and runCommand().
// project  - CI project object; supplies the build prefix path.
// jobName  - accepted for interface compatibility; not used in this function.
// debug    - true configures a Debug build, false a Release build.
def runCompileClientCommand(platform, project, jobName, boolean debug=false)
{
    project.paths.construct_build_prefix()

    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON'
    String warningArgs = '-DWERROR=ON'
    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'

    // Target the CI node's gfx arch for PR builds only. Safe navigation: when
    // BRANCH_NAME is not set the expression is null (falsy) and no explicit
    // target is passed instead of throwing.
    String amdgpuTargets = env.BRANCH_NAME?.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
    String buildTypeArgClients = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release'
    // The project build prefix is passed to CMake as the package search prefix.
    String cmakePrefixPathArg = "-DCMAKE_PREFIX_PATH=${project.paths.project_build_prefix}"

    def command = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}/clients
mkdir -p build && cd build
${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArgClients} ${clientArgs} ${warningArgs} ${cmakePrefixPathArg} ${amdgpuTargets} ../
make -j\$(nproc)
"""
    platform.runCommand(this, command)
}
// Runs the rocfft-test binary from the clients/staging directory of the
// selected build tree.
//
// platform  - CI platform object; supplies runCommand().
// project   - CI project object; supplies the build prefix path.
// debug     - true selects build/debug, false selects build/release.
// gfilter   - optional googletest filter; omitted from the command when empty.
// extraArgs - extra text appended verbatim to the test command line.
def runTestCommand (platform, project, boolean debug=false, gfilter='', extraArgs='')
{
    String testBinaryName = 'rocfft-test'
    String directory = debug ? 'debug' : 'release'

    // Only pass a gtest filter when the caller supplied one.
    String gfilterArg = gfilter ? "--gtest_filter=${gfilter}" : ''

    def command = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging
ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} --gtest_color=yes --R 80 --nrand 10 ${extraArgs}
"""
    platform.runCommand(this, command)
}
// Packages the selected build tree, archives the resulting artifacts and then
// removes temporary packaging files and object files from the build tree.
//
// platform - CI platform object; supplies makePackage(), runCommand() and
//            archiveArtifacts().
// project  - CI project object; supplies the build prefix path.
// jobName  - accepted for interface compatibility; not used in this function.
// debug    - true selects build/debug, false selects build/release.
def runPackageCommand(platform, project, jobName, boolean debug=false)
{
    String directory = debug ? 'debug' : 'release'

    // Element 0 is run as a command, element 1 is handed to archiveArtifacts.
    def packaging = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${directory}",false)
    platform.runCommand(this, packaging[0])
    platform.archiveArtifacts(this, packaging[1])

    //trim temp files
    def cleanupCommand = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}/build/${directory}/
rm -rf _CPack_Packages/
find -name '*.o' -delete
"""
    platform.runCommand(this, cleanupCommand)
}
// Builds the library only (no clients) from a clean build/release directory,
// with a restricted kernel generator configuration.
//
// platform   - CI platform object; supplies jenkinsLabel and runCommand().
// project    - CI project object; supplies the build prefix path.
// jobName    - accepted for interface compatibility; not used in this function.
// genPattern - value passed as -DGENERATOR_PATTERN.
// genSmall   - value for -DGENERATOR_MANUAL_SMALL_SIZE; option omitted when null.
// genLarge   - value for -DGENERATOR_MANUAL_LARGE_SIZE; option omitted when null.
// onlyDouble - true adds -DGENERATOR_PRECISION=double.
def runSubsetBuildCommand(platform, project, jobName, genPattern, genSmall, genLarge, boolean onlyDouble)
{
    project.paths.construct_build_prefix()

    // Don't build clients, since we're just testing if the library can build
    String clientArgs = ''
    String warningArgs = '-DWERROR=ON'
    String buildTypeArg = '-DCMAKE_BUILD_TYPE=Release'
    String buildTypeDir = 'release'

    String genPatternArgs = "-DGENERATOR_PATTERN=${genPattern}"
    String manualSmallArgs = (genSmall != null) ? "-DGENERATOR_MANUAL_SMALL_SIZE=${genSmall}" : ''
    String manualLargeArgs = (genLarge != null) ? "-DGENERATOR_MANUAL_LARGE_SIZE=${genLarge}" : ''
    String precisionArgs = onlyDouble ? '-DGENERATOR_PRECISION=double' : ''
    String kernelArgs = "${genPatternArgs} ${manualSmallArgs} ${manualLargeArgs} ${precisionArgs}"

    String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'

    //Set CI node's gfx arch as target if PR, otherwise use default targets of the library.
    // Safe navigation: when BRANCH_NAME is not set the expression is null
    // (falsy) and the library defaults are used instead of throwing.
    String amdgpuTargets = env.BRANCH_NAME?.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : ''
    String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"

    // The previous build directory is removed first so every subset build
    // starts from a clean tree.
    def command = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}
rm -rf build/${buildTypeDir}
mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir}
${auxiliary.gfxTargetParser()}
${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${amdgpuTargets} ${rtcBuildCache} ../..
make -j\$(nproc)
"""
    platform.runCommand(this, command)
}
return this
rocFFT-rocm-6.4.3/.jenkins/debug.groovy 0000664 0000000 0000000 00000004640 15015373413 0017667 0 ustar 00root root 0000000 0000000 #!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
// Builds, tests and packages the Debug configuration on the given nodes.
def runCI = { nodeDetails, jobName ->
    def debugProject = new rocProject('rocFFT-internal', 'Debug')
    debugProject.defaults.ccache = true
    debugProject.timeout.compile = 600
    debugProject.timeout.test = 600
    debugProject.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def dockerTargets = new dockerNodes(nodeDetails, jobName, debugProject)

    boolean formatCheck = false

    // Loaded by the compile step and reused by the test and package steps.
    def commonGroovy

    def compileCommand = { platform, project ->
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        commonGroovy.runCompileCommand(platform, project, jobName, true)
    }

    def testCommand = { platform, project ->
        commonGroovy.runTestCommand(platform, project, true)
    }

    def packageCommand = { platform, project ->
        commonGroovy.runPackageCommand(platform, project, jobName, true)
    }

    buildProject(debugProject, formatCheck, dockerTargets.dockerArray, compileCommand, testCommand, packageCommand)
}
ci: {
    // Top-level Jenkins job name, derived from the build URL.
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    // Job-specific triggers, extended with the shared entries.
    def propertyList = auxiliary.appendPropertyList(
        ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]])

    // Job-specific node selection, extended with the shared entries.
    def jobNameList = auxiliary.appendJobNameList(
        ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])])

    // Apply the properties registered for the job that is running.
    propertyList.each { jobName, property ->
        if (urlJobName == jobName) {
            properties(auxiliary.addCommonProperties(property))
        }
    }

    // Run the CI stage for the job that is running.
    jobNameList.each { jobName, nodeDetails ->
        if (urlJobName == jobName) {
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
        }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if (!jobNameList.keySet().contains(urlJobName)) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu16:['any']], urlJobName)
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/extended.groovy 0000664 0000000 0000000 00000007020 15015373413 0020374 0 ustar 00root root 0000000 0000000 #!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
// Runs rocfft-test from the clients/staging directory of the selected build
// tree, optionally with a gtest filter, a repro database and a repeat count.
//
// platform - CI platform object; supplies runCommand().
// project  - CI project object; supplies the build prefix path.
// debug    - true selects build/debug, false selects build/release.
// gfilter  - optional googletest filter; omitted from the command when empty.
// reprodb  - optional value for --repro-db; omitted when empty.
// repeat   - passed as --gtest_repeat only when greater than 1.
def runBitwiseReproTest (platform, project, boolean debug=false, gfilter='', reprodb='', int repeat=1)
{
    String testBinaryName = 'rocfft-test'
    String directory = debug ? 'debug' : 'release'

    // Each optional argument collapses to an empty string when not requested.
    String gfilterArg = gfilter ? "--gtest_filter=${gfilter}" : ''
    String reproDbArg = reprodb ? "--repro-db=${reprodb}" : ''
    String repeatArg = (repeat > 1) ? "--gtest_repeat=${repeat}" : ''

    def command = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging
ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db ${gfilterArg} ${reproDbArg} ${repeatArg} --gtest_color=yes --R 80 --nrand 10
"""
    platform.runCommand(this, command)
}
// Builds rocFFT and its clients, runs the bitwise-reproducibility test, then packages.
// nodeDetails and jobName are forwarded to dockerNodes to select the build nodes.
def runCI = { nodeDetails, jobName ->
    def extendedProject = new rocProject('rocFFT-internal', 'Extended')
    extendedProject.defaults.ccache = true
    extendedProject.timeout.compile = 600
    extendedProject.timeout.test = 600
    extendedProject.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def buildNodes = new dockerNodes(nodeDetails, jobName, extendedProject)

    boolean formatCheck = false

    // Loaded in the compile step and reused by the package step.
    def common

    def compileCommand = { platform, project ->
        common = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        common.runCompileCommand(platform, project, jobName)
        common.runCompileClientCommand(platform, project, jobName, false)
    }

    def testCommand = { platform, project ->
        runBitwiseReproTest(platform, project, false, "*pow2_1D/bitwise_repro_test*", 'bitwise_repro.db', 2)
    }

    def packageCommand = { platform, project ->
        common.runPackageCommand(platform, project, jobName)
    }

    buildProject(extendedProject, formatCheck, buildNodes.dockerArray, compileCommand, testCommand, packageCommand)
}
ci: {
    // Top-level Jenkins job name, derived from the URL of the running build.
    String topLevelJob = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins job properties, extended by the shared library.
    def jobProperties = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
    jobProperties = auxiliary.appendPropertyList(jobProperties)

    // Job name -> node details handed to runCI, extended by the shared library.
    def jobNodes = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])]
    jobNodes = auxiliary.appendJobNameList(jobNodes)

    // Apply the properties registered for the current job, if any.
    jobProperties.each { name, jobProps ->
        if (topLevelJob == name) {
            properties(auxiliary.addCommonProperties(jobProps))
        }
    }

    // Run the CI stage registered for the current job, if any.
    jobNodes.each { name, details ->
        if (topLevelJob == name) {
            stage(name) {
                runCI(details, name)
            }
        }
    }

    // Jobs that are not in the list (e.g. compute-rocm-dkms-no-npi-1901) fall
    // back to a default trigger and a default node selection.
    boolean isListedJob = jobNodes.keySet().contains(topLevelJob)
    if (!isListedJob) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(topLevelJob) {
            runCI([ubuntu18:['gfx906']], topLevelJob)
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/multigpu.groovy 0000664 0000000 0000000 00000005541 15015373413 0020450 0 ustar 00root root 0000000 0000000 #!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
// Builds rocFFT with MPI enabled and runs the multi-GPU tests; nothing is packaged.
// nodeDetails and jobName are forwarded to dockerNodes to select the build nodes.
def runCI = { nodeDetails, jobName ->
    def multiGpuProject = new rocProject('rocFFT-internal', 'multigpu')
    multiGpuProject.defaults.ccache = true
    multiGpuProject.timeout.compile = 600
    multiGpuProject.timeout.test = 600
    multiGpuProject.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def buildNodes = new dockerNodes(nodeDetails, jobName, multiGpuProject)

    boolean formatCheck = false

    // Loaded in the compile step and reused by the test step.
    def common

    def compileCommand = { platform, project ->
        common = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        // The final 'true' argument requests an MPI-enabled build.
        common.runCompileCommand(platform, project, jobName, false, false, true)
        common.runCompileClientCommand(platform, project, jobName, false)
    }

    def testCommand = { platform, project ->
        // Single-process multi-GPU tests.
        common.runTestCommand(platform, project, false, "*multi_gpu*")
        // The same test filter again, launched through MPI across 4 ranks.
        common.runTestCommand(platform, project, false, "*multi_gpu*", '--mp_lib mpi --mp_ranks 4 --mp_launch "/usr/local/openmpi/bin/mpirun --np 4 ./rocfft_mpi_worker"')
    }

    def packageCommand = { platform, project ->
        // Intentionally empty: MPI-enabled rocFFT is not distributed, so no
        // MPI-enabled package may be published where other builds could pick it up.
    }

    buildProject(multiGpuProject, formatCheck, buildNodes.dockerArray, compileCommand, testCommand, packageCommand)
}
ci: {
    // Top-level Jenkins job name, derived from the URL of the running build.
    String topLevelJob = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins job properties, extended by the shared library.
    def jobProperties = ["main":[pipelineTriggers([cron('0 1 * * 0')])]]
    jobProperties = auxiliary.appendPropertyList(jobProperties)

    // Job name -> node details handed to runCI, extended by the shared library.
    def jobNodes = ["main":([ubuntu20:['8gfx90a']])]
    jobNodes = auxiliary.appendJobNameList(jobNodes)

    // Apply the properties registered for the current job, if any.
    jobProperties.each { name, jobProps ->
        if (topLevelJob == name) {
            properties(auxiliary.addCommonProperties(jobProps))
        }
    }

    // Run the CI stage registered for the current job, if any.
    jobNodes.each { name, details ->
        if (topLevelJob == name) {
            stage(name) {
                runCI(details, name)
            }
        }
    }

    // Jobs that are not in the list (e.g. compute-rocm-dkms-no-npi-1901) fall
    // back to a default trigger and a default node selection.
    boolean isListedJob = jobNodes.keySet().contains(topLevelJob)
    if (!isListedJob) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(topLevelJob) {
            runCI([ubuntu20:['8gfx90a']], topLevelJob)
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/performance.groovy 0000664 0000000 0000000 00000021417 15015373413 0021103 0 ustar 00root root 0000000 0000000 #!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
/**
 * Builds two copies of rocFFT for the performance comparison: the checked-out
 * sources (with clients) and a reference checkout in ref-repo (without clients).
 *
 * @param platform     object providing jenkinsLabel and runCommand(this, script)
 * @param project      project object providing paths and library dependency settings
 * @param jobName      not used by this function
 * @param debug        selects a Debug build (with ROCFFT_DEVICE_FORCE_RELEASE=ON) in build/debug
 * @param buildStatic  not used by this function
 */
def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false)
{
    // Pull-request branches are compared against 'develop', everything else against 'master'.
    boolean isPullRequest = (env.BRANCH_NAME ==~ /PR-\d+/)
    def reference = isPullRequest ? 'develop' : 'master'

    project.paths.construct_build_prefix()

    // Shell snippet that fetches the library dependencies; stays empty unless enabled.
    def getDependenciesCommand = ""
    if (project.installLibraryDependenciesFromCI)
    {
        for (libraryName in project.libraryDependencies)
        {
            getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false)
        }
    }

    // Check out the reference branch next to the build tree.
    dir("${project.paths.project_build_prefix}/ref-repo") {
        git branch: "${reference}", url: 'https://github.com/ROCmSoftwarePlatform/rocFFT.git'
    }

    String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_BENCH=ON'
    String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_BENCH=OFF'
    String warningArgs = '-DWERROR=ON'
    String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release'
    String buildTypeDir = debug ? 'debug' : 'release'
    String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
    String cmakeExe = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake'

    // Both configure lines share everything except the client switches.
    String configureHead = "${cmakeExe} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang -DAMDGPU_TARGETS=\$gfx_arch ${buildTypeArg}"
    String configureTail = "${warningArgs} ${rtcBuildCache} ../.."

    // 'set -e' is only enabled after the dependency snippet has run.
    def script = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}
${getDependenciesCommand}
set -e
mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
${auxiliary.gfxTargetParser()}
${configureHead} ${clientArgs} ${configureTail}
make -j\$(nproc)
popd
cd ref-repo
mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir}
${auxiliary.gfxTargetParser()}
${configureHead} ${noclientArgs} ${configureTail}
make -j\$(nproc)
"""
    platform.runCommand(this, script)
}
/**
 * Runs the rocFFT performance comparison and records the results.
 *
 * For each precision ('single', 'double') the rocfft-perf script benchmarks the
 * reference library (ref-repo) against the freshly built one, post-processes the
 * results, and produces an HTML report that is archived and published.
 * Afterwards rocPTS is cloned and installed, and record_pts.py is run on the
 * collected datasets.
 *
 * @param platform  object providing jenkinsLabel, gpu and runCommand(this, script)
 * @param project   project object providing paths.project_build_prefix and paths.build_prefix
 * @param debug     selects the 'debug' build directories instead of 'release'
 */
def runTestCommand (platform, project, boolean debug=false)
{
    // NOTE(review): 'sudo' is not referenced anywhere below; kept so the call is still made.
    String sudo = auxiliary.sudo(platform.jenkinsLabel)
    String directory = debug ? 'debug' : 'release'

    def dataTypes = ['single', 'double']
    for (def dataType in dataTypes)
    {
        // Benchmark both libraries, post-process, and render the HTML report.
        // The report is renamed per GPU so reports from different nodes do not collide.
        def command = """#!/usr/bin/env bash
set -ex
pwd
cd ${project.paths.project_build_prefix}
export ROCFFT_RTC_CACHE_PATH="\$JENKINS_HOME_LOCAL/rocfft_build_cache.db"
./scripts/perf/rocfft-perf run --bench ./build/${directory}/clients/staging/dyna-rocfft-bench --lib ./ref-repo/build/${directory}/library/src/librocfft.so --lib ./build/${directory}/library/src/librocfft.so --out ./${dataType}_ref --out ./${dataType}_change --device 0 --precision ${dataType} --suite benchmarks
ls ${dataType}_change
ls ${dataType}_ref
mkdir ${dataType}_results
./scripts/perf/rocfft-perf post ./${dataType}_results ./${dataType}_ref ./${dataType}_change
ls ${dataType}_change/*.mdat
./scripts/perf/rocfft-perf html ./${dataType}_results ./${dataType}_ref ./${dataType}_change
mv ${dataType}_results/figs.html ${dataType}_results/figs_${platform.gpu}.html
"""
        platform.runCommand(this, command)

        archiveArtifacts "${project.paths.project_build_prefix}/${dataType}_results/*.html"
        publishHTML([allowMissing: false,
                     alwaysLinkToLastBuild: false,
                     keepAll: false,
                     reportDir: "${project.paths.project_build_prefix}/${dataType}_results",
                     reportFiles: "figs_${platform.gpu}.html",
                     reportName: "${dataType}-precision-${platform.gpu}",
                     reportTitles: "${dataType}-precision-${platform.gpu}"])
    }

    // Clone and install rocPTS using the bot's GitHub credentials.
    withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')])
    {
        platform.runCommand(
            this,
            """
cd ${project.paths.build_prefix}
git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.2.0
cd rocPTS
python3 -m pip install build
python3 -m build
python3 -m pip install .
"""
        )
    }

    // Drop the recording script from the shared library's resources into the build tree.
    writeFile(
        file: project.paths.project_build_prefix + "/record_pts.py",
        text: libraryResource("com/amd/scripts/record_pts.py"))

    // When CHANGE_ID is set, the branch is created first so the checkout below can succeed.
    def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : ""

    // Every continued line ends in " \" (space, then backslash): the backslash-newline
    // is removed when the lines are joined, so without the space an argument would
    // fuse with the first token of the following line.
    def command = """#!/usr/bin/env bash
set -ex
cd ${project.paths.project_build_prefix}
${setupBranch}
git checkout \$BRANCH_NAME
benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d)
mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref
cp -uf ./*_change/* \${benchmark_folder}/all_change
cp -uf ./*_ref/* \${benchmark_folder}/all_ref
python3 ./record_pts.py \
--dataset-path \$PWD/\${benchmark_folder} \
--reference-dataset all_ref \
--new-dataset all_change \
--new-build . \
--reference-build ./ref-repo \
-v 5.5 \
-l pts_rocfft_benchmark_data-v1.0.0
"""
    // The PTS API id and key are exposed to the script as environment variables.
    withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')])
    {
        platform.runCommand(this, command)
    }
}
// Builds and benchmarks rocFFT, then summarizes the published reports in a
// pull-request comment when the build belongs to a PR branch.
// nodeDetails and jobName are forwarded to dockerNodes to select the build nodes.
def runCI = { nodeDetails, jobName ->
    def perfProject = new rocProject('rocFFT-internal', 'Performance')
    perfProject.defaults.ccache = true
    perfProject.timeout.compile = 600
    perfProject.timeout.test = 600
    perfProject.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def buildNodes = new dockerNodes(nodeDetails, jobName, perfProject)

    boolean formatCheck = false
    def commonGroovy

    // GPUs seen by the compile step; used afterwards to build the report links.
    def testedGpus = []
    def precisions = ['single', 'double']

    def compileCommand = { platform, project ->
        testedGpus.add(platform.gpu)
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        runCompileCommand(platform, project, jobName)
    }

    def testCommand = { platform, project ->
        runTestCommand(platform, project)
    }

    // No package step for performance runs.
    buildProject(perfProject, formatCheck, buildNodes.dockerArray, compileCommand, testCommand, null)

    // Assemble the comment text: commit hashes first, then one link per GPU and precision.
    def reportText = "Performance reports: \n" + "Commit hashes: \n"
    for (parentHash in perfProject.gitParentHashes) {
        reportText += "${parentHash} \n"
    }
    for (gpu in testedGpus) {
        for (precision in precisions) {
            reportText += "[${gpu} ${precision} report](${JOB_URL}/${precision}-precision-${gpu})\n"
        }
    }

    boolean isPullRequest = (env.BRANCH_NAME ==~ /PR-\d+/)
    if (isPullRequest) {
        // Overwrite any earlier report comment; post a new one only if none exists.
        boolean foundExisting = false
        for (existingComment in pullRequest.comments) {
            if (existingComment.body.contains("Performance reports:")) {
                foundExisting = true
                existingComment.body = reportText
            }
        }
        if (!foundExisting) {
            pullRequest.comment(reportText)
        }
    }
}
ci: {
    // Top-level Jenkins job name, derived from the URL of the running build.
    String topLevelJob = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins job properties, extended by the shared library.
    def jobProperties = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
    jobProperties = auxiliary.appendPropertyList(jobProperties)

    // Job name -> node details handed to runCI, extended by the shared library.
    def jobNodes = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900','gfx906']])]
    jobNodes = auxiliary.appendJobNameList(jobNodes)

    // Apply the properties registered for the current job, if any.
    jobProperties.each { name, jobProps ->
        if (topLevelJob == name) {
            properties(auxiliary.addCommonProperties(jobProps))
        }
    }

    // Run the CI stage registered for the current job, if any.
    jobNodes.each { name, details ->
        if (topLevelJob == name) {
            stage(name) {
                runCI(details, name)
            }
        }
    }

    // Jobs that are not in the list (e.g. compute-rocm-dkms-no-npi-1901) fall
    // back to a default trigger and a default node selection.
    boolean isListedJob = jobNodes.keySet().contains(topLevelJob)
    if (!isListedJob) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(topLevelJob) {
            runCI([ubuntu18:['gfx906']], topLevelJob)
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/staticanalysis.groovy 0000664 0000000 0000000 00000007236 15015373413 0021640 0 ustar 00root root 0000000 0000000 #!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
/**
 * Static-analysis "compile" step: checks the formatting of Python sources with yapf.
 *
 * Every *.py file outside paths containing 'build/' is formatted with
 * 'yapf --style pep8' and the output is diffed against the file on disk.
 *
 * @param platform  object whose runCommand(this, script) executes the shell script
 * @param project   project object; project.paths.project_build_prefix is the working directory
 * @param jobName   not used by this function
 * @param debug     not used by this function
 */
def runCompileCommand(platform, project, jobName, boolean debug=false)
{
    project.paths.construct_build_prefix()

    def formatCheckScript = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}
yapf --version
find . -iname '*.py' \
| grep -v 'build/' \
| xargs -n 1 -P 1 -I{} -t sh -c 'yapf --style pep8 {} | diff - {}'
"""
    platform.runCommand(this, formatCheckScript)
}
// Runs two builds on the same nodes: the static-analysis/format check, followed
// by a series of kernel-subset builds.
// nodeDetails and jobName are forwarded to dockerNodes to select the build nodes.
def runCI = { nodeDetails, jobName ->
    def analysisProject = new rocProject('rocFFT-internal', 'StaticAnalysis')
    analysisProject.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def analysisNodes = new dockerNodes(nodeDetails, jobName, analysisProject)

    boolean formatCheck = true
    boolean staticAnalysis = true

    def compileCommand = { platform, project ->
        runCompileCommand(platform, project, jobName, false)
    }

    // No test or package step for the static-analysis build.
    buildProject(analysisProject , formatCheck, analysisNodes.dockerArray, compileCommand, null, null, staticAnalysis)

    def subsetProject = new rocProject('rocFFT-internal', 'BuildKernelSubset')
    def subsetNodes = new dockerNodes(nodeDetails, jobName, subsetProject)
    def common

    def compileSubsetCommand = { platform, project ->
        common = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"

        // Patterns pow2 and pow7, no manual small/large sizes, double precision only.
        common.runSubsetBuildCommand(platform, project, jobName, 'pow2,pow7', null, null, true)
        // Large sizes, double precision only.
        common.runSubsetBuildCommand(platform, project, jobName, 'large', null, null, true)
        // 2D sizes, double precision only.
        common.runSubsetBuildCommand(platform, project, jobName, '2D', null, null, true)
        // Manual large list containing an unsupported size (10), to check that it is filtered out.
        common.runSubsetBuildCommand(platform, project, jobName, 'none', null, '10,50,100,200,336', true)
        // Manual small list containing an unsupported size (23), to check that it is filtered out.
        common.runSubsetBuildCommand(platform, project, jobName, 'none', '23,1024', '10,50,100,200,336', true)
        // Disabled case in which none of the manual sizes are supported:
        //common.runSubsetBuildCommand(platform, project, jobName, 'none', '23', '10', true)
    }

    // The kernel-subset build reuses the same formatCheck flag and has no test or package step.
    buildProject(subsetProject , formatCheck, subsetNodes.dockerArray, compileSubsetCommand, null, null)
}
ci: {
    // Top-level Jenkins job name, derived from the URL of the running build.
    String topLevelJob = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins job properties, extended by the shared library.
    def jobProperties = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])],
                         "rocm-docker":[]]
    jobProperties = auxiliary.appendPropertyList(jobProperties)

    // Job name -> node details handed to runCI, extended by the shared library.
    def jobNodes = ["compute-rocm-dkms-no-npi-hipclang":[]]
    jobNodes = auxiliary.appendJobNameList(jobNodes)

    // Apply the properties registered for the current job, if any.
    jobProperties.each { name, jobProps ->
        if (topLevelJob == name) {
            properties(auxiliary.addCommonProperties(jobProps))
        }
    }

    // Run the CI stage registered for the current job, if any.
    // There is no fallback here for jobs that are not in the list.
    jobNodes.each { name, details ->
        if (topLevelJob == name) {
            stage(name) {
                runCI(details, name)
            }
        }
    }
}
rocFFT-rocm-6.4.3/.jenkins/staticlibrary.groovy 0000664 0000000 0000000 00000004654 15015373413 0021462 0 ustar 00root root 0000000 0000000 #!/usr/bin/env groovy
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
// Builds rocFFT through the shared compile helper with its fifth argument set to
// true (the static-library variant, per this project's name), then tests and packages it.
// nodeDetails and jobName are forwarded to dockerNodes to select the build nodes.
def runCI = { nodeDetails, jobName ->
    def staticLibProject = new rocProject('rocFFT-internal', 'StaticLibrary')
    staticLibProject.defaults.ccache = true
    staticLibProject.timeout.compile = 600
    staticLibProject.timeout.test = 600
    staticLibProject.libraryDependencies = ['rocRAND','hipRAND']

    // Define test architectures, optional rocm version argument is available
    def buildNodes = new dockerNodes(nodeDetails, jobName, staticLibProject)

    boolean formatCheck = false

    // Loaded in the compile step and reused by the test and package steps.
    def common

    def compileCommand = { platform, project ->
        common = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        common.runCompileCommand(platform, project, jobName, false, true)
    }

    def testCommand = { platform, project ->
        common.runTestCommand(platform, project)
    }

    def packageCommand = { platform, project ->
        common.runPackageCommand(platform, project, jobName)
    }

    buildProject(staticLibProject, formatCheck, buildNodes.dockerArray, compileCommand, testCommand, packageCommand)
}
ci: {
    // Top-level Jenkins job name, derived from the URL of the running build.
    String topLevelJob = auxiliary.getTopJobName(env.BUILD_URL)

    // Job name -> Jenkins job properties, extended by the shared library.
    def jobProperties = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
    jobProperties = auxiliary.appendPropertyList(jobProperties)

    // Job name -> node details handed to runCI, extended by the shared library.
    def jobNodes = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900']])]
    jobNodes = auxiliary.appendJobNameList(jobNodes)

    // Apply the properties registered for the current job, if any.
    jobProperties.each { name, jobProps ->
        if (topLevelJob == name) {
            properties(auxiliary.addCommonProperties(jobProps))
        }
    }

    // Run the CI stage registered for the current job, if any.
    jobNodes.each { name, details ->
        if (topLevelJob == name) {
            stage(name) {
                runCI(details, name)
            }
        }
    }

    // Jobs that are not in the list (e.g. compute-rocm-dkms-no-npi-1901) fall
    // back to a default trigger and a default node selection.
    boolean isListedJob = jobNodes.keySet().contains(topLevelJob)
    if (!isListedJob) {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(topLevelJob) {
            runCI([ubuntu16:['gfx906']], topLevelJob)
        }
    }
}
rocFFT-rocm-6.4.3/.readthedocs.yaml 0000664 0000000 0000000 00000000572 15015373413 0017042 0 ustar 00root root 0000000 0000000 # Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Version of the Read the Docs configuration schema used by this file.
version: 2
# Sphinx settings: path to the Sphinx configuration file.
sphinx:
configuration: docs/conf.py
# Additional output formats requested for the documentation.
formats: [htmlzip, pdf, epub]
# Python requirements file installed for the documentation build.
python:
install:
- requirements: docs/sphinx/requirements.txt
# Build environment: operating system image and tool versions.
build:
os: ubuntu-22.04
tools:
python: "mambaforge-22.9"
# Conda environment file for the build.
conda:
environment: docs/environment.yml
rocFFT-rocm-6.4.3/CHANGELOG.md 0000664 0000000 0000000 00000047260 15015373413 0015431 0 ustar 00root root 0000000 0000000 # Changelog for rocFFT
Documentation for rocFFT is available at
[https://rocm.docs.amd.com/projects/rocFFT/en/latest/](https://rocm.docs.amd.com/projects/rocFFT/en/latest/).
## rocFFT 1.0.32 for ROCm 6.4.0
### Changed
* Building with the address sanitizer option sets xnack+ on relevant GPU
architectures and adds address-sanitizer support to runtime-compiled
kernels.
* The `AMDGPU_TARGETS` build variable should be replaced with `GPU_TARGETS`. `AMDGPU_TARGETS` is deprecated.
### Removed
* Removed ahead-of-time compiled kernels for the gfx906, gfx940, and gfx941 architectures. These architectures still
function the same, but kernels for them are now compiled at runtime.
* Removed consumer GPU architectures from the precompiled kernel cache that ships with
rocFFT. rocFFT continues to ship with a cache of precompiled RTC kernels for data-center
and workstation architectures. As before, user-level caches can be enabled by setting the
environment variable ROCFFT_RTC_CACHE_PATH to a writeable file location.
### Optimized
* Improved MPI transform performance by using all-to-all communication for global transpose operations.
Point-to-point communications are still used when all-to-all is not possible.
* Improved the performance of unit-strided, complex interleaved, forward and inverse, length (64,64,64) FFTs.
### Resolved issues
* Fixed incorrect results from 2-kernel 3D FFT plans that used non-default output strides. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/507).
* Plan descriptions can be reused with different strides for different plans. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/504).
* Fixed client packages to depend on hipRAND instead of rocRAND.
* Fixed potential integer overflows during large MPI transforms.
## rocFFT 1.0.31 for ROCm 6.3.0
### Added
* rocfft-test now includes a --smoketest option.
* Support for the gfx1151, gfx1200, and gfx1201 architectures.
* Implemented experimental APIs to allow computing FFTs on data
distributed across multiple MPI ranks. These APIs can be enabled with the
`ROCFFT_MPI_ENABLE` CMake option. This option defaults to `OFF`.
When `ROCFFT_MPI_ENABLE` is set to `ON`:
* `rocfft_plan_description_set_comm` can be called to provide an
MPI communicator to a plan description, which can then be passed
to `rocfft_plan_create`. Each rank calls
`rocfft_field_add_brick` to specify the layout of data bricks on
that rank.
* An MPI library with ROCm acceleration enabled is required at
build time and at runtime.
### Changed
* Compilation uses amdclang++ instead of hipcc.
* CLI11 replaces Boost Program Options as the command line parser for clients and samples.
## rocFFT 1.0.30 for ROCm 6.2.4
### Optimizations
* Implemented 1D kernels for factorizable sizes > 1024 and < 2048.
### Fixes
* Fixed plan creation failure on some even-length real-complex transforms that use Bluestein's algorithm.
### Additions
* GFX1151 Support
## rocFFT 1.0.29 for ROCm 6.2.1
### Optimizations
* Implemented 1D kernels for factorizable sizes < 1024
## rocFFT 1.0.28 for ROCm 6.2.0
### Optimizations
* Implemented multi-device transform for 3D pencil decomposition. Contiguous dimensions on input and output bricks
are transformed locally, with global transposes to make remaining dimensions contiguous.
### Changes
* Add option in dyna-bench to load the libs in forward and then reverse order for benchmark tests.
* Randomly generated accuracy tests are now disabled by default; these can be enabled using
the --nrand option (which defaults to 0).
* Use Bonferroni multi-hypothesis testing framework by default for benchmark tests.
## rocFFT 1.0.27 for ROCm 6.1.1
### Fixes
* Fixed kernel launch failure on execute of very large odd-length real-complex transforms.
### Additions
* Enable multi-gpu testing on systems without direct GPU-interconnects
## rocFFT 1.0.26 for ROCm 6.1.0
### Changes
* Multi-device FFTs now allow batch greater than 1
* Multi-device, real-complex FFTs are now supported
* rocFFT now statically links libstdc++ when only `std::experimental::filesystem` is available (to guard
against ABI incompatibilities with newer libstdc++ libraries that include `std::filesystem`)
## rocFFT 1.0.25 for ROCm 6.0.0
### Additions
* Implemented experimental APIs to allow computing FFTs on data distributed across multiple devices
in a single process
* `rocfft_field` is a new type that can be added to a plan description to describe the layout of FFT
input or output
* `rocfft_field_add_brick` can be called to describe the brick decomposition of an FFT field, where each
brick can be assigned a different device
These interfaces are still experimental and subject to change. We are interested in getting feedback.
You can raise questions and concerns by opening issues in the
[rocFFT issue tracker](https://github.com/ROCmSoftwarePlatform/rocFFT/issues).
Note that multi-device FFTs currently have several limitations (we plan to address these in future
releases):
* Real-complex (forward or inverse) FFTs are not supported
* Planar format fields are not supported
* Batch (the `number_of_transforms` provided to `rocfft_plan_create`) must be 1
* FFT input is gathered to the current device at run time, so all FFT data must fit on that device
### Optimizations
* Improved the performance of several 2D/3D real FFTs supported by `2D_SINGLE` kernel. Offline
tuning provides more optimization for gfx90a
* Removed an extra kernel launch from even-length, real-complex FFTs that use callbacks
### Changes
* Built kernels in a solution map to the library kernel cache
* Real forward transforms (real-to-complex) no longer overwrite input; rocFFT may still overwrite real
inverse (complex-to-real) input, as this allows for faster performance
* `rocfft-rider` and `dyna-rocfft-rider` have been renamed to `rocfft-bench` and `dyna-rocfft-bench`;
these are controlled by the `BUILD_CLIENTS_BENCH` CMake option
* Links for the former file names are installed, and the former `BUILD_CLIENTS_RIDER` CMake option
is accepted for compatibility, but both will be removed in a future release
* Binaries in debug builds no longer have a `-d` suffix
### Fixes
* rocFFT now correctly handles load callbacks that convert data from a smaller data type (e.g., 16-bit
integers -> 32-bit float)
## rocFFT 1.0.24 for ROCm 5.7.0
### Optimizations
* Improved the performance of complex forward/inverse 1D FFTs (2049 <= length <= 131071) that use
Bluestein's algorithm
### Additions
* Implemented a solution map version converter and finished the first conversion from ver.0 to ver.1
* Version 1 removes some incorrect kernels (sbrc/sbcr using `half_lds`)
### Changes
* Moved `rocfft_rtc_helper` executable to the `lib/rocFFT` directory on Linux
* Moved library kernel cache to the `lib/rocFFT` directory
## rocFFT 1.0.23 for ROCm 5.6.0
### Additions
* Implemented half-precision transforms; these can be requested by passing `rocfft_precision_half` to
`rocfft_plan_create`
* Implemented a hierarchical solution map that saves information on how to decompose a problem
and the kernels that are used
* Implemented a first version of offline-tuner to support tuning kernels for C2C and Z2Z problems
### Changes
* Replaced `std::complex` with hipComplex data types for the data generator
* FFT plan dimensions are now sorted to be row-major internally where possible, which produces
better plans if the dimensions were accidentally specified in a different order (column-major, for
example)
* Added the `--precision` argument to benchmark and test clients (`--double` is still accepted but is
deprecated as a method to request a double-precision transform)
* Improved performance test suite statistical framework
### Fixes
* Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch
failure
## rocFFT 1.0.22 for ROCm 5.5.0
### Optimizations
* Improved the performance of 1D lengths < 2048 that use Bluestein's algorithm
* Reduced code generation time during plan creation
* Optimized 3D R2C and C2R lengths 32, 84, 128
* Optimized batched small 1D R2C and C2R cases
### Additions
* Added gfx1101 to default `AMDGPU_TARGETS`
### Changes
* Moved client programs to C++17
* Moved planar kernels and infrequently used Stockham kernels to be runtime-compiled
* Moved transpose, real-complex, Bluestein, and Stockham kernels to the library kernel cache
### Fixes
* Removed zero-length twiddle table allocations, which fixes errors from `hipMallocManaged`
* Fixed incorrect freeing of HIP stream handles during twiddle computation when multiple devices are
present
## rocFFT 1.0.21 for ROCm 5.4.3
### Fixes
* Removed the source directory from `rocm_install_targets` to prevent the installation of `rocfft.h` in an
unintended location
## rocFFT 1.0.20 for ROCm 5.4.1
### Fixes
* Fixed incorrect results on strided large 1D FFTs where batch size does not equal the stride
## rocFFT 1.0.19 for ROCm 5.4.0
### Optimizations
* Optimized some strided large 1D plans
### Additions
* Added the `rocfft_plan_description_set_scale_factor` API to efficiently multiply each output element of
an FFT by a given scaling factor
* Created a `rocfft_kernel_cache.db` file next to the installed library; SBCC, CR, and RC kernels are
moved to this file when built with the library, and are runtime-compiled for new GPU architectures
* Added gfx1100 and gfx1102 to default `AMDGPU_TARGETS`
### Changes
* Moved the runtime compilation cache to in-memory by default
* A default on-disk cache can encounter contention problems on multi-node clusters with a shared
filesystem
* rocFFT can still use an on-disk cache by setting the `ROCFFT_RTC_CACHE_PATH` environment
variable
## rocFFT 1.0.18 for ROCm 5.3.0
### Changes
* The runtime compilation cache now looks for environment variables `XDG_CACHE_HOME` (on Linux)
and `LOCALAPPDATA` (on Windows) before falling back to `HOME`
* Moved computation of the twiddle table from the host to the device
### Optimizations
* Optimized 2D R2C and C2R to use 2-kernel plans where possible
* Improved performance of the Bluestein algorithm
* Optimized sbcc-168 and 100 by using half-LDS
* Optimized length-280 2D and 3D transforms
* Added kernels for factorizable 1D lengths < 128
### Fixes
* Fixed occasional failures to parallelize runtime compilation of kernels (failures would be retried
serially and ultimately succeed, but this would take extra time)
* Fixed failures of some R2C 3D transforms that use the unsupported `TILE_UNALIGNED` SBRC kernels
(an example is 98^3 R2C out-of-place)
* Fixed bugs in the `SBRC_ERC` type
## rocFFT 1.0.17 for ROCm 5.2.0
### Additions
* Packages for test and benchmark executables on all supported operating systems using CPack
* Added file and folder reorganization changes, with backward compatibility support, using
`rocm-cmake` wrapper functions
### Changes
* Improved reuse of twiddle memory between plans
* Set a default load/store callback when only one callback type is set via the API (for improved
performance)
* Updated the GoogleTest dependency to version 1.11
### Optimizations
* Introduced a new access pattern of LDS (non-linear) and applied it on sbcc kernels len 64 and 81 for a
performance improvement
* Applied `lds-non-linear`, `direct-load-to-register`, and `direct-store-from-register` on sbcr kernels for
a performance improvement
### Fixes
* Correctness of certain transforms with unusual strides
* Incorrect handling of user-specified stream for runtime-compiled kernels
* Incorrect buffer allocation in `rocfft-test` on in-place transforms with different input and output sizes
## rocFFT 1.0.16 for ROCm 5.1.0
### Changes
* Supported unaligned tile dimension for `SBRC_2D` kernels
* Improved test and benchmark infrastructure by adding RAII
* Enabled runtime compilation of length-2304 FFT kernel during plan creation
* Added tokenizer for test suite
* Reduced twiddle memory requirements for even-length, real-complex transforms
* Clients can now be built separately from the main library
### Optimizations
* Optimized more large 1D cases by using `L1D_CC` plan
* Optimized the 3D 200^3 C2R case
* Optimized the 1D 2^30 double precision on MI200
* Added padding to work buffer sizes to improve performance in many cases
### Fixes
* Fixed the correctness of some R2C transforms with unusual strides
### Removals
* The hipFFT API (header) has been removed; use the
[hipFFT](https://github.com/ROCmSoftwarePlatform/hipFFT) package or repository to obtain the API
## rocFFT 1.0.15 for ROCm 5.0.0
### Changes
* Enabled runtime compilation of single FFT kernels > length 1024
* Re-aligned the split device library into four roughly equal libraries
* Implemented the FuseShim framework to replace the original OptimizePlan
* Implemented the generic buffer-assignment framework
* The buffer assignment is no longer performed by each node--we designed a generic algorithm to
test and pick the best assignment path
* With the help of FuseShim, we can achieve the most kernel-fusions possible
* Don't read the imaginary part of the DC and Nyquist modes for even-length complex-to-real
transforms
### Optimizations
* Optimized twiddle conjugation; complex-to-complex inverse transforms should now have similar
performance to forward transforms
* Improved performance of single-kernel, small 2D transforms
## rocFFT 1.0.14 for ROCm 4.5.0
### Optimizations
* Optimized SBCC kernels of lengths 52, 60, 72, 80, 84, 96, 104, 108, 112, 160, 168, 208, 216, 224, and
240 with a new kernel generator
### Additions
* Added support for Windows 10 as a build target
### Changes
* Packaging has been split into a runtime package (`rocfft`) and a development package
(`rocfft-devel`):
The development package depends on the runtime package. When installing the runtime package,
the package manager will suggest the installation of the development package to aid users
transitioning from the previous version's combined package. This suggestion by the package manager is
for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion`
feature in the runtime package is introduced as a deprecated feature and will be removed in a future
ROCm release.
### Fixes
* Fixed validation failures for even-length R2C in-place 2D and 3D cubic sizes, such as 100^2 (or ^3),
200^2 (or ^3), and 256^2 (or ^3)
* We combine two kernels (`r2c-transpose`) instead of combining the three kernels
(`stockham-r2c-transpose`)
### Changes
* Split 2D device code into separate libraries
## rocFFT 1.0.13 for ROCm 4.4.0
### Optimizations
* Improved plans by removing unnecessary transpose steps
* Optimized scheme selection for 3D problems
* Imposed fewer restrictions on `3D_BLOCK_RC` selection (more problems can use `3D_BLOCK_RC` and
have performance gains)
* Enabled `3D_RC`; some 3D problems with SBCC-supported z-dim can use fewer kernels to get
benefits
* Forced `--length` 336 336 56 (dp) to use faster `3D_RC` to prevent it from being skipped by a
conservative threshold test
* Optimized some even-length R2C/C2R cases by doing more in-place operations and combining
pre- and post-processing into Stockham kernels
* Added radix-17
### Additions
* Added a new kernel generator for select fused 2D transforms
### Fixes
* Improved large 1D transform decompositions
## rocFFT 1.0.12 for ROCm 4.3.0
### Changes
* Re-split device code into single-precision, double-precision, and miscellaneous kernels
### Fixes
* Fixed potential crashes in double-precision planar->planar transpose
* Fixed potential crashes in 3D transforms with unusual strides for SBCC-optimized sizes
* Improved buffer placement logic
### Additions
* Added a new kernel generator for select lengths; new kernels have improved performance
* Added public `rocfft_execution_info_set_load_callback` and `rocfft_execution_info_set_store_callback`
API functions to allow running extra logic when loading data from and storing data to global
memory during a transform
### Removals
* Removed R2C pair schemes and kernels
### Optimizations
* Optimized 2D and 3D R2C 100 and 1D Z2Z 2500
* Reduced number of kernels for 2D/3D sizes where higher dimension is 64, 128, 256
### Fixes
* Fixed potential crashes in 3D transforms with unusual strides, for SBCC-optimized sizes
## rocFFT 1.0.11 for ROCm 4.2.0
### Changes
* Moved device code into the main library
### Optimizations
* Improved performance for single-precision kernels exercising all except radix-2/7 butterfly ops
* Minor optimization for C2R 3D 100 and 200 cube sizes
* Optimized some C2C and R2C 3D 64, 81, 100, 128, 200, and 256 rectangular sizes
* When factoring, test to see if the remaining length is explicitly supported
* Explicitly added radix-7 lengths 14, 21, and 224 to list of supported lengths
* Optimized R2C 2D and 3D 128, 200, and 256 cube sizes
### Fixes
* Fixed potential crashes in small 3D transforms with unusual strides
([issue 311](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/311))
* Fixed potential crashes when running transforms on multiple devices
([issue 310](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/310))
## rocFFT 1.0.10 for ROCm 4.1.0
### Additions
* Explicitly specify `MAX_THREADS_PER_BLOCK` through `__launch_bounds__` for all kernels
* Switched to a new syntax for specifying AMD GPU architecture names and features
### Optimizations
* Optimized C2C and R2C 3D 64, 81, 100, 128, 200, and 256 cube sizes
* Improved the performance of the standalone out-of-place transpose kernel
* Optimized the 1D length 40000 C2C case
* Enabled radix-7 for size 336
* New radix-11 and radix-13 kernels; used in length 11 and 13 (and some of their multiples)
transforms
### Changes
* rocFFT now automatically allocates a work buffer if the plan requires one and none is provided
* An explicit `rocfft_status_invalid_work_buffer` error is now returned when a work buffer of insufficient
size is provided
* Updated online documentation
* Updated Debian package name version with separated underscore ( _ )
* Adjusted accuracy test tolerances and how they are compared
### Fixes
* Fixed a 4x4x8192 accuracy failure
## rocFFT 1.0.8 for ROCm 3.10.0
### Optimizations
* Optimized the 1D length 10000 C2C case
### Changes
* Added the `BUILD_CLIENTS_ALL` CMake option
### Fixes
* Fixed the correctness of SBCC and SBRC kernels with non-unit strides
* Fixed fused C2R kernel when a Bluestein transform follows it
## rocFFT 1.0.7 for ROCm 3.9.0
### Optimizations
* New R2C and C2R fused kernels to combine pre- and post-processing steps with transpose
* Enabled diagonal transpose for 1D and 2D power-of-2 cases
* New single kernels for small power-of-2, 3, and 5 sizes
* Added more radix-7 kernels
### Changes
* Explicitly disabled XNACK and SRAM-ECC features on AMDGPU hardware
### Fixes
* Fixed 2D C2R transform with length 1 on one dimension
* Fixed a potential thread unsafety in logging
## rocFFT 1.0.6 for ROCm 3.8.0
### Optimizations
* Improved the performance of 1D batch-paired R2C transforms of odd length
* Added some radix-7 kernels
* Improved the performance for 1D length 6561 and 10000
* Improved the performance for certain 2D transform sizes
### Changes
* Allowed a static library build with `BUILD_SHARED_LIBS=OFF` CMake option
* Updated GoogleTest dependency to version 1.10
### Fixes
* Correctness of certain large 2D sizes
## rocFFT 1.0.5 for ROCm 3.7.0
### Optimizations
* Optimized C2C power-of-2 middle sizes
### Changes
* Parallelized work in unit tests and eliminated duplicate cases
### Fixes
* Correctness of certain large 1D, and 2D power-of-3 and 5 sizes
* Incorrect buffer assignment for some even-length R2C transforms
* `<cstddef>` inclusion on C compilers
* Incorrect results on non-unit strides with SBCC/SBRC kernels
rocFFT-rocm-6.4.3/CMakeLists.txt 0000664 0000000 0000000 00000026107 15015373413 0016355 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Request C++17 as the default language standard for every target.
set( CMAKE_CXX_STANDARD 17 )

# Default install prefix.  This must be cached before project() runs,
# because it is set without FORCE and would otherwise lose to the
# default CMake computes there.
#
# PROJECT_BINARY_DIR is only defined once project() has been called, so
# it is still empty here; CMAKE_BINARY_DIR is already valid and names
# the top of the build tree.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()
# Pick Release as the build type when the user supplied neither a
# build type nor a list of configuration types.  Like the install
# prefix, this default has to be in the cache before project() runs.
if( NOT DEFINED CMAKE_BUILD_TYPE AND NOT DEFINED CMAKE_CONFIGURATION_TYPES )
  set( CMAKE_BUILD_TYPE
    Release
    CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Flag tested by the client CMake files to detect that they are being
# configured as part of the full library build.
set( ROCFFT_BUILD_SCOPE ON )

project( rocfft LANGUAGES CXX C )
# Locate rocm-cmake, which supplies the ROCM* helper modules included
# further down.  When no installed copy is found, a copy is downloaded,
# configured and installed into ${PROJECT_EXTERN_DIR} inside the build
# tree, and the package is looked up again there.
set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
find_package( ROCM 0.7.3 CONFIG QUIET PATHS ${ROCM_PATH} /opt/rocm )
if( NOT ROCM_FOUND )
  set( rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download" )
  set( rocm_cmake_url
    "https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip" )
  set( rocm_cmake_archive "${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip" )

  file( DOWNLOAD ${rocm_cmake_url} ${rocm_cmake_archive} STATUS status LOG log )
  list( GET status 0 status_code )
  list( GET status 1 status_string )
  if( NOT status_code EQUAL 0 )
    message( FATAL_ERROR "error: downloading
'${rocm_cmake_url}' failed
status_code: ${status_code}
status_string: ${status_string}
log: ${log}
" )
  endif()
  message( STATUS "downloading... done" )

  # Each step below must succeed for the final find_package() to work.
  # Check every exit status so a failure is reported at the step that
  # caused it rather than as a generic "ROCM not found" afterwards.
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E tar xzvf ${rocm_cmake_archive}
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_result )
  if( NOT rocm_cmake_result EQUAL 0 )
    message( FATAL_ERROR
      "error: extracting '${rocm_cmake_archive}' failed: ${rocm_cmake_result}" )
  endif()

  execute_process(
    COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}
    RESULT_VARIABLE rocm_cmake_result )
  if( NOT rocm_cmake_result EQUAL 0 )
    message( FATAL_ERROR
      "error: configuring rocm-cmake '${rocm_cmake_tag}' failed: ${rocm_cmake_result}" )
  endif()

  execute_process(
    COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_result )
  if( NOT rocm_cmake_result EQUAL 0 )
    message( FATAL_ERROR
      "error: installing rocm-cmake '${rocm_cmake_tag}' failed: ${rocm_cmake_result}" )
  endif()

  find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
endif()
# Pull in the rocm-cmake helper modules this file relies on, in the
# same order as they are listed.
foreach( rocm_module IN ITEMS
    ROCMSetupVersion
    ROCMCreatePackage
    ROCMInstallTargets
    ROCMPackageConfigHelpers
    ROCMInstallSymlinks
    ROCMCheckTargetIds
    ROCMClients
    ROCMHeaderWrapper )
  include( ${rocm_module} )
endforeach()

# Library version, handed to rocm-cmake's standardized version setup.
set( VERSION_STRING "1.0.32" )
rocm_setup_version( VERSION ${VERSION_STRING} )
# Make the project's own cmake/ directory searchable by include() and
# find_package().  Entries the user already placed in CMAKE_MODULE_PATH
# stay ahead of it.
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Extra build output.
option( BUILD_VERBOSE "Output additional build information" OFF )

# BUILD_SHARED_LIBS is built into CMake; declaring it as an option
# makes it visible in cmake-gui.
option( BUILD_SHARED_LIBS "Build rocFFT as a shared library" ON )

option( WERROR "Treat warnings as errors" OFF )
option( BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF )
option( ROCFFT_RUNTIME_COMPILE_DEFAULT "Compile kernels at runtime by default" OFF )

# The offline tuner executable is opt-in (-DROCFFT_BUILD_OFFLINE_TUNER=ON)
# since most users are not expected to tune.
option( ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_offline_tuner" OFF )

# hipRAND can be switched off; when it is on, USE_HIPRAND is defined
# for every compilation in this directory and below.
option( USE_HIPRAND "Use hipRAND to provide device-side input generation" ON )
if( USE_HIPRAND )
  add_compile_definitions( USE_HIPRAND )
endif()

# The function pool is compiled as N separate files so that its build
# can run in parallel.
set( ROCFFT_FUNCTION_POOL_N 8 CACHE STRING "Number of files to split function_pool into for compilation" )
# Optional backward compatibility for the file/folder reorganization:
# when enabled (never on Windows), rocm-cmake generates wrappers for the
# public headers under library/include.
#
# The header directory is given relative to PROJECT_SOURCE_DIR rather
# than CMAKE_SOURCE_DIR, so it still resolves to this project's tree
# when rocFFT is added to a larger build with add_subdirectory().
option( BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" OFF )
if( BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32 )
  rocm_wrap_header_dir(
    ${PROJECT_SOURCE_DIR}/library/include
    PATTERNS "*.h"
    GUARDS SYMLINK WRAPPER
    WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR}
  )
endif()
# Warning flags shared by the targets that opt in to them; -Werror is
# appended only when the WERROR option is on.
set( WARNING_FLAGS
  -Wall
  -Wno-unused-function
  -Wimplicit-fallthrough
  -Wunreachable-code
  -Wsign-compare )
if( WERROR )
  list( APPEND WARNING_FLAGS -Werror )
endif()
# GPU architectures offered as the default target list.
set( DEFAULT_GPUS
  gfx803
  gfx900
  gfx906
  gfx908
  gfx90a
  gfx940
  gfx941
  gfx942
  gfx1030
  gfx1100
  gfx1101
  gfx1102
  gfx1151
  gfx1200
  gfx1201 )

# Address-sanitizer builds add the sanitizer compile/link flags, link
# with lld, replace the default target list with xnack+ variants,
# switch ROCFFT_KERNEL_CACHE_ENABLE off and define ADDRESS_SANITIZER.
if( BUILD_ADDRESS_SANITIZER )
  add_compile_options( -fsanitize=address )
  add_link_options( -fsanitize=address )
  add_link_options( -shared-libasan )
  set( DEFAULT_GPUS
    gfx908:xnack+
    gfx90a:xnack+
    gfx940:xnack+
    gfx941:xnack+
    gfx942:xnack+ )
  add_link_options( -fuse-ld=lld )
  set( ROCFFT_KERNEL_CACHE_ENABLE off )
  add_compile_definitions( ADDRESS_SANITIZER )
endif()
# When BUILD_LOCAL_GPU_TARGET_ONLY is set, let rocm_local_targets()
# rewrite DEFAULT_GPUS if that command is available; otherwise warn and
# keep the default list.
if( BUILD_LOCAL_GPU_TARGET_ONLY )
  message( STATUS "Building only for local GPU target" )
  if( COMMAND rocm_local_targets )
    rocm_local_targets( DEFAULT_GPUS )
  else()
    message( WARNING "Unable to determine local GPU targets. Falling back to default GPUs." )
  endif()
endif()

# AMDGPU_TARGETS is the deprecated spelling: warn when it is used
# without GPU_TARGETS.
if( AMDGPU_TARGETS AND NOT GPU_TARGETS )
  message( DEPRECATION "AMDGPU_TARGETS use is deprecated. Use GPU_TARGETS." )
endif()

# AMDGPU_TARGETS defaults to DEFAULT_GPUS and is then passed through
# rocm_check_target_ids().
set( AMDGPU_TARGETS
  "${DEFAULT_GPUS}"
  CACHE STRING
  "Target default GPUs if AMDGPU_TARGETS is not defined. (Deprecated, prefer GPU_TARGETS)" )
rocm_check_target_ids( AMDGPU_TARGETS TARGETS "${AMDGPU_TARGETS}" )

# Not forced on purpose: a GPU_TARGETS value given on the command line
# takes precedence over this default.
set( GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for" )
# Both the library and the clients reach the device through HIP, so it
# is a hard requirement.
find_package( HIP REQUIRED CONFIG )

# Building for CUDA devices through the nvidia backend needs two user
# supplied variables: CUDA_PREFIX (the CUDA installation) and CUDA_ARCH
# (for example sm_75).
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required (e.g. /usr/local/cuda-11.4)" )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required. (e.g. sm_75)" )
  endif()
  add_compile_options(
    -I${HIP_ROOT_DIR}/include
    -I${CUDA_PREFIX}/include
    -D__HIP_PLATFORM_NVIDIA__ )
  add_link_options(
    -L${CUDA_PREFIX}/lib64
    -pthread )
endif()

# hipcc supplies the HIP include directories and platform define by
# itself; any other compiler has to be given them explicitly.
if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
  include_directories( ${HIP_INCLUDE_DIRS} )
  if( USE_CUDA )
    add_compile_definitions( __HIP_PLATFORM_NVIDIA__ )
  else()
    add_compile_definitions( __HIP_PLATFORM_AMD__ )
  endif()
endif()
# Optional MPI support.  ROCFFT_CRAY_MPI_ENABLE is only declared here;
# it is consumed by the client builds.
option( ROCFFT_MPI_ENABLE "Enable MPI" OFF )
option( ROCFFT_CRAY_MPI_ENABLE "Cray MPI" OFF )

if( ROCFFT_MPI_ENABLE )
  find_package( MPI REQUIRED )
  include_directories( SYSTEM ${MPI_INCLUDE_PATH} )
endif()
add_subdirectory( library )

include( clients/cmake/build-options.cmake )

# BUILD_CLIENTS is shorthand for switching on every client kind.
if( BUILD_CLIENTS )
  foreach( client_kind IN ITEMS BENCH SAMPLES TESTS )
    set( BUILD_CLIENTS_${client_kind} ON )
  endforeach()
endif()

# BUILD_CLIENTS_RIDER is the former name of BUILD_CLIENTS_BENCH and is
# still honoured.
if( BUILD_CLIENTS_RIDER )
  set( BUILD_CLIENTS_BENCH ${BUILD_CLIENTS_RIDER} )
endif()
# Everything below only applies when at least one kind of client is
# being built: package components are declared with their distro
# dependencies, then the clients themselves are configured.
if( BUILD_CLIENTS_SAMPLES
    OR BUILD_CLIENTS_TESTS
    OR BUILD_CLIENTS_BENCH )

  # Distribution id used to choose RPM package names, unless the user
  # provided one.
  if( NOT CLIENTS_OS )
    rocm_set_os_id( CLIENTS_OS )
  endif()

  # Package names of the OpenMP and FFTW runtimes needed by the tests.
  # They are set unless BUILD_CLIENTS_TESTS_OPENMP is defined and false.
  if( BUILD_CLIENTS_TESTS AND (NOT DEFINED BUILD_CLIENTS_TESTS_OPENMP OR BUILD_CLIENTS_TESTS_OPENMP) )
    set( OPENMP_DEB "libgomp1" )
    set( FFTW_DEB "libfftw3-bin" )

    # RPM names: generic defaults first, SLES uses different ones.
    set( OPENMP_RPM "libgomp" )
    set( FFTW_RPM "fftw-libs" )
    if( CLIENTS_OS STREQUAL "sles" )
      set( OPENMP_RPM "libgomp1" )
      set( FFTW_RPM "libfftw3-3" )
    endif()
  endif()

  rocm_package_setup_component( clients )

  # hipRAND becomes a package dependency only when it is in use.
  if( USE_HIPRAND )
    set( HIPRAND_DEP hiprand )
  endif()

  if( BUILD_CLIENTS_TESTS )
    rocm_package_setup_client_component(
      tests
      DEPENDS
        DEB ${OPENMP_DEB} ${FFTW_DEB} ${HIPRAND_DEP}
        RPM ${OPENMP_RPM} ${FFTW_RPM} ${HIPRAND_DEP}
    )
  endif()

  if( BUILD_CLIENTS_BENCH )
    rocm_package_setup_client_component(
      benchmarks
      DEPENDS
        DEB ${HIPRAND_DEP}
        RPM ${HIPRAND_DEP}
    )
  endif()

  add_subdirectory( clients )
endif()
# Windows packaging: ZIP archives rooted at C:/hipSDK.
# NOTE(review): the FORCE below replaces any CMAKE_INSTALL_PREFIX the
# user passed on the command line - confirm that this is intended.
if( WIN32 )
  set( CPACK_SOURCE_GENERATOR "ZIP" )
  set( CPACK_GENERATOR "ZIP" )
  set( CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE )
  set( INSTALL_PREFIX "C:/hipSDK" )
  set( CPACK_SET_DESTDIR OFF )
  set( CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "" )
  set( CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF )
endif()
# Package-specific CPack settings.

# With the rocclr HIP runtime, the package depends on the matching HIP
# runtime package; address-sanitizer builds use the -asan variant.
string( TOLOWER "${HIP_RUNTIME}" HIP_RUNTIME_LOWER )
if( HIP_RUNTIME_LOWER STREQUAL "rocclr" )
  set( DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
  if( BUILD_ADDRESS_SANITIZER )
    set( DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
  endif()
  rocm_package_add_dependencies( "${DEPENDS_HIP_RUNTIME} >= 4.5.0" )
endif()

set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" )
set( CPACK_RPM_PACKAGE_LICENSE "MIT" )

# The escaped \${...} keeps the reference unexpanded here so that it is
# stored literally in these variables.
set( CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" )
set( ROCFFT_CONFIG_DIR
  "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}"
  CACHE PATH
  "Path placed into ldconfig file" )

set( package_name rocfft )

rocm_create_package(
  NAME ${package_name}
  DESCRIPTION "ROCm FFT library"
  MAINTAINER "rocfft-maintainer@amd.com"
  LDCONFIG
  LDCONFIG_DIR ${ROCFFT_CONFIG_DIR}
)
rocFFT-rocm-6.4.3/CppCheckSuppressions.txt 0000664 0000000 0000000 00000000342 15015373413 0020465 0 ustar 00root root 0000000 0000000 // generator uses implicit constructors for convenience
noExplicitConstructor:library/src/device/generator/generator.h
// has some false positives and isn't hard to run manually for periodic
// dead code sweeps
unusedFunction
rocFFT-rocm-6.4.3/LICENSE.md 0000664 0000000 0000000 00000005412 15015373413 0015215 0 ustar 00root root 0000000 0000000 # License
Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
This product includes software from copyright holders as shown below, and distributed under their license terms as specified.
CLI11 2.2 Copyright (c) 2017-2024 University of Cincinnati, developed by Henry
Schreiner under NSF AWARD 1414736. All rights reserved.
Redistribution and use in source and binary forms of CLI11, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
rocFFT-rocm-6.4.3/README.md 0000664 0000000 0000000 00000007520 15015373413 0015072 0 ustar 00root root 0000000 0000000 # rocFFT
rocFFT is a software library for computing fast Fourier transforms (FFTs) written in the HIP
programming language. It's part of AMD's software ecosystem based on
[ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD and
NVIDIA GPUs.
## Documentation
> [!NOTE]
> The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the rocFFT/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
To build our documentation locally, use the following code:
```Bash
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Build and install
You can install rocFFT using pre-built packages or building from source.
* Installing pre-built packages:
1. Download the pre-built packages from the
[ROCm package servers](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) or use the
GitHub releases tab to download the source (this may give you a more recent version than the
pre-built packages).
2. Run: `sudo apt update && sudo apt install rocfft`
* Building from source:
rocFFT is compiled with AMD's clang++ and uses CMake. You can specify several options to customize your
build. The following commands build a shared library for supported AMD GPUs:
```bash
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_C_COMPILER=amdclang ..
make -j
```
You can compile a static library using the `-DBUILD_SHARED_LIBS=off` option.
With rocFFT, you can use indirect function calls by default; this requires ROCm 4.3 or higher. You can
use `-DROCFFT_CALLBACKS_ENABLED=off` with CMake to prevent these calls on older ROCm
compilers. Note that with this configuration, callbacks won't work correctly.
rocFFT includes the following clients:
* `rocfft-bench`: Runs general transforms and is useful for performance analysis
* `rocfft-test`: Runs various regression tests
* Various small samples
| Client | CMake option | Dependencies |
|:------|:-----------------|:-----------------|
| `rocfft-bench` | `-DBUILD_CLIENTS_BENCH=on` | hipRAND |
| `rocfft-test` | `-DBUILD_CLIENTS_TESTS=on` | hipRAND, FFTW, GoogleTest |
| samples | `-DBUILD_CLIENTS_SAMPLES=on` | None |
Clients are not built by default. To build them, use `-DBUILD_CLIENTS=on`. The build process
downloads and builds GoogleTest and FFTW if they are not already installed.
Clients can be built separately from the main library. For example, you can build all the clients with
an existing rocFFT library by invoking CMake from within the `rocFFT-src/clients` folder:
```bash
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_PREFIX_PATH=/path/to/rocFFT-lib ..
make -j
```
To install client dependencies on Ubuntu, run:
```bash
sudo apt install libgtest-dev libfftw3-dev libboost-dev
```
We use version 1.11 of GoogleTest.
## Examples
A summary of the latest functionality and workflow to compute an FFT with rocFFT is available on the
[rocFFT documentation portal](https://rocm.docs.amd.com/projects/rocFFT/en/latest/).
You can find additional examples in the `clients/samples` subdirectory.
## Support
You can report bugs and feature requests through the GitHub
[issue tracker](https://github.com/ROCm/rocFFT/issues).
## Contribute
If you want to contribute to rocFFT, you must follow our [contribution guidelines](https://github.com/ROCm/rocFFT/blob/develop/.github/CONTRIBUTING.md).
rocFFT-rocm-6.4.3/ValgrindSuppressions.txt 0000664 0000000 0000000 00000000267 15015373413 0020561 0 ustar 00root root 0000000 0000000 {
Memcheck:Param
sched_setaffinity(mask)
...
fun:hipMalloc
}
{
Memcheck:Param
sched_setaffinity(mask)
...
fun:hipMemGetInfo
} rocFFT-rocm-6.4.3/clients/ 0000775 0000000 0000000 00000000000 15015373413 0015250 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/CMakeLists.txt 0000664 0000000 0000000 00000010643 15015373413 0020014 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install and packaging prefixes.  They are cached without
# FORCE, so they must be set before project() and never replace a value
# that is already in the cache (for example one set by a parent build).
#
# PROJECT_BINARY_DIR is not defined before the first project() call, so
# a standalone clients build would see an empty value there;
# CMAKE_BINARY_DIR is always valid and names the top of the build tree.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()
# Pick Release as the build type when the user supplied neither a
# build type nor a list of configuration types; this has to be cached
# before project() runs.
if( NOT DEFINED CMAKE_BUILD_TYPE AND NOT DEFINED CMAKE_CONFIGURATION_TYPES )
  set( CMAKE_BUILD_TYPE
    Release
    CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Flag tested by the per-client CMake files to detect that they are
# configured through this clients project.
set( ROCFFT_CLIENTS_BUILD_SCOPE ON )

# Client dependencies may be compiled as part of this project.
project( rocfft-clients LANGUAGES CXX C )

set( CMAKE_CXX_STANDARD 17 )
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# rocm-cmake may already have been located by an enclosing build.
if( NOT ROCM_FOUND )
  find_package( ROCM 0.7.3 REQUIRED )
endif()

include( ROCMInstallTargets )

# Install a small version file with the clients component so that the
# rocfft-client package is never empty.
file( WRITE
  "${PROJECT_BINARY_DIR}/package/client-version"
  "${rocfft_VERSION_MAJOR}.${rocfft_VERSION_MINOR}.${rocfft_VERSION_PATCH}-${BUILD_ID}\n" )
rocm_install(
  FILES ${PROJECT_BINARY_DIR}/package/client-version
  DESTINATION .info
  COMPONENT clients )
# Ask CMake to write compile_commands.json for clang tooling and
# editors.  Only the make/nmake and ninja generators act on it, but
# leaving it on for the others does no harm.
set( CMAKE_EXPORT_COMPILE_COMMANDS ON )

# Outside the full library build, selecting no client at all means
# "build every client".
if( NOT ROCFFT_BUILD_SCOPE
    AND NOT BUILD_CLIENTS_SAMPLES
    AND NOT BUILD_CLIENTS_TESTS
    AND NOT BUILD_CLIENTS_BENCH )
  foreach( client_kind IN ITEMS SAMPLES TESTS BENCH )
    set( BUILD_CLIENTS_${client_kind} ON )
  endforeach()
endif()
# Host and device link libraries depend on the backend.  The CUDA
# backend needs CUDA_PREFIX and CUDA_ARCH from the user and only sets
# the host link libraries; the HIP backend uses the imported hip::
# targets for both.
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required." )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required." )
  endif()
  add_compile_options(
    -I${HIP_ROOT_DIR}/include
    -I${CUDA_PREFIX}/include
    -D__HIP_PLATFORM_NVIDIA__ )
  add_link_options(
    -L${CUDA_PREFIX}/lib64
    -pthread )
  add_compile_options(
    --cuda-path=${CUDA_PREFIX}
    --cuda-gpu-arch=${CUDA_ARCH}
    -xcuda )
  set( ROCFFT_CLIENTS_HOST_LINK_LIBS -lcudart -ldl -lrt )
else()
  set( ROCFFT_CLIENTS_HOST_LINK_LIBS hip::host )
  set( ROCFFT_CLIENTS_DEVICE_LINK_LIBS hip::device )
endif()
# MPI is required as soon as MPI support is enabled.
if( ROCFFT_MPI_ENABLE )
  find_package( MPI REQUIRED )
endif()

# Configure each requested client kind.
if( BUILD_CLIENTS_SAMPLES )
  add_subdirectory( samples )
endif()

if( BUILD_CLIENTS_TESTS )
  add_subdirectory( tests )
endif()

if( BUILD_CLIENTS_BENCH )
  add_subdirectory( bench )
endif()
rocFFT-rocm-6.4.3/clients/bench/ 0000775 0000000 0000000 00000000000 15015373413 0016327 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/bench/CMakeLists.txt 0000664 0000000 0000000 00000013626 15015373413 0021077 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix.  It is cached without FORCE, so it must be
# set before project() and never replaces a value that is already in
# the cache (for example one set by a parent build).
#
# PROJECT_BINARY_DIR is not defined before the first project() call, so
# a standalone build of this directory would see an empty value there;
# CMAKE_BINARY_DIR is always valid and names the top of the build tree.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()
# Pick Release as the build type when the user supplied neither a
# build type nor a list of configuration types; this has to be cached
# before project() runs.
if( NOT DEFINED CMAKE_BUILD_TYPE AND NOT DEFINED CMAKE_CONFIGURATION_TYPES )
  set( CMAKE_BUILD_TYPE
    Release
    CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-bench LANGUAGES CXX )

set( CMAKE_CXX_STANDARD 17 )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
# Each dependency is looked up only when an enclosing build has not
# already provided it.

# rocFFT itself: skipped when the rocfft target is part of this build.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif()

if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()

if( NOT ROCM_FOUND )
  find_package( ROCM 0.7.3 REQUIRED )
endif()

# hipRAND is needed only when USE_HIPRAND is on.
if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

include( ROCMInstallTargets )

# Benchmark executables configured by the loop that follows.
set( bench_list
  rocfft-bench
  dyna-rocfft-bench )
# Configure one executable per entry of bench_list.  Every benchmark
# gets the same warning flags, include paths, output directory, install
# rule and legacy "rider" alias; only the main source file and the
# linked libraries differ between the two.
foreach( bench ${bench_list})
# rocfft-bench is built from bench.cpp, any other entry from
# dyna-bench.cpp; both share array_validator.cpp and bench.h.
if(${bench} STREQUAL "rocfft-bench")
add_executable( ${bench} ../../shared/array_validator.cpp bench.cpp bench.h )
else()
add_executable( ${bench} ../../shared/array_validator.cpp dyna-bench.cpp bench.h )
endif()
target_compile_options( ${bench} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
# NB: hip-clang includes omp.h, so we need to specify the location
# of ROCM_CLANG_ROOT at cmake config time if we are using clang++.
# NOTE(review): the lone `$` entry below looks like a generator
# expression whose `<...>` body was lost - confirm against upstream.
target_include_directories( ${bench}
PRIVATE
$
${HIP_CLANG_ROOT}/include
${ROCM_CLANG_ROOT}/include
)
# rocfft-bench links the rocFFT library directly.  The other benchmark
# links ${CMAKE_DL_LIBS} instead and not roc::rocfft - presumably it
# loads rocFFT at runtime; verify against dyna-bench.cpp.
if(${bench} STREQUAL "rocfft-bench")
target_link_libraries( ${bench}
PRIVATE
hip::device
roc::rocfft
)
else()
target_link_libraries( ${bench}
PRIVATE
${CMAKE_DL_LIBS}
hip::device
)
endif()
# hipRAND is linked only when USE_HIPRAND is on.
if( USE_HIPRAND )
target_link_libraries( ${bench}
PRIVATE
hip::hiprand
)
endif()
# We need to include both rocfft.h and rocfft-export.h
target_include_directories( ${bench}
PRIVATE
${CMAKE_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/
${HIP_CLANG_ROOT}/include
)
# Backend-specific host libraries; the variable is not set in this file.
target_link_libraries( ${bench} PUBLIC
${ROCFFT_CLIENTS_HOST_LINK_LIBS}
)
# Optional MPI support; the Cray variant additionally links mpi_gtl_hsa
# from a gtl/lib directory located relative to the MPI library.
if( ROCFFT_MPI_ENABLE )
target_link_libraries( ${bench}
PRIVATE
MPI::MPI_CXX
)
if ( ROCFFT_CRAY_MPI_ENABLE)
target_link_libraries( ${bench}
PRIVATE
"mpi_gtl_hsa"
)
get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
target_link_directories( ${bench}
PRIVATE
${MPI_LIBDIR}/../../../../gtl/lib )
endif()
endif()
set_target_properties( ${bench} PROPERTIES
CXX_STANDARD_REQUIRED ON
)
# Choose the runtime output directory relative to PROJECT_BINARY_DIR:
# ../staging inside the full library build, ../bin inside the clients
# build, and bin when this directory is configured alone.
if( ROCFFT_BUILD_SCOPE )
set( BENCH_OUT_DIR "/../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( BENCH_OUT_DIR "/../bin" )
else()
set( BENCH_OUT_DIR "/bin")
endif()
string( CONCAT BENCH_OUT_DIR "${PROJECT_BINARY_DIR}" ${BENCH_OUT_DIR} )
set_target_properties(${bench}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${BENCH_OUT_DIR} )
rocm_install(TARGETS ${bench} COMPONENT benchmarks)
# install compatibility for old name of bench program - symlink on
# unix, hardlink on windows (since privilege is required to create
# symlinks there)
# The legacy name is the target name with "bench" replaced by "rider".
string(REPLACE bench rider bench_legacy ${bench})
# NOTE(review): the bare `$` in both BENCH_NEW_NAME values below looks
# like a generator expression whose `<...>` body was lost - confirm
# against upstream.
if( WIN32 )
set( BENCH_LINK_COMMAND create_hardlink )
set( BENCH_NEW_NAME ${BENCH_OUT_DIR}/$${CMAKE_EXECUTABLE_SUFFIX} )
set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy}${CMAKE_EXECUTABLE_SUFFIX} )
else()
set( BENCH_LINK_COMMAND create_symlink )
set( BENCH_NEW_NAME $ )
set( BENCH_OLD_NAME ${BENCH_OUT_DIR}/${bench_legacy} )
endif()
# Create the legacy-named link after every build of the target, then
# install it next to the real executable.
add_custom_command(
TARGET ${bench}
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E ${BENCH_LINK_COMMAND} ${BENCH_NEW_NAME} ${BENCH_OLD_NAME}
)
install(
FILES ${BENCH_OLD_NAME}
DESTINATION ${CMAKE_INSTALL_BINDIR}
COMPONENT benchmarks
)
endforeach()
# dyna-rocfft-bench is linked against the experimental filesystem
# library when the standard library does not provide std::filesystem.
include(../../cmake/std-filesystem.cmake)
target_link_std_experimental_filesystem(dyna-rocfft-bench)
rocFFT-rocm-6.4.3/clients/bench/bench.cpp 0000664 0000000 0000000 00000045671 15015373413 0020127 0 ustar 00root root 0000000 0000000 // Copyright (C) 2016 - 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include
#include
#include "../../shared/CLI11.hpp"
#include "../../shared/arithmetic.h"
#include "../../shared/gpubuf.h"
#include "../../shared/hip_object_wrapper.h"
#include "../../shared/rocfft_params.h"
#include "bench.h"
#include "rocfft/rocfft.h"
// rocfft-bench entry point.
//
// Builds an FFT problem description either from individual command-line options or
// from a single --token string, validates it, checks that it fits in free device
// memory, creates the plan, allocates and fills the I/O buffers, runs one warm-up
// execution followed by `ntrial` timed executions, and prints the per-trial GPU time
// (ms) and the derived gflops figure.
//
// Return value:
//  - the CLI11 exit code when argument parsing fails;
//  - EXIT_FAILURE when the token cannot be parsed;
//  - EXIT_SUCCESS with a "SKIPPED" message when the problem does not fit in memory;
//  - for HIP failures caught below: EXIT_SUCCESS when ignore_hip_runtime_failures is
//    set (the default), EXIT_FAILURE otherwise.
// Invalid parameters and plan-creation failures are reported by throwing.
int main(int argc, char* argv[])
{
    // This helps with mixing output of both wide and narrow characters to the screen
    std::ios::sync_with_stdio(false);

    // Control output verbosity:
    int verbose{};

    // number of GPUs to use:
    int ngpus{};

    // hip Device number for running tests:
    int deviceId{};

    // Ignore runtime failures.
    // eg: hipMalloc failing when there isn't enough free vram.
    bool ignore_hip_runtime_failures{true};

    // Number of performance trial samples
    int ntrial{};

    // FFT parameters:
    rocfft_params params;

    // input/output FFT grids
    std::vector ingrid;
    std::vector outgrid;

    // Token string to fully specify fft params.
    std::string token;

    CLI::App app{"rocfft-bench command line options"};

    // Declare the supported options. Some option pointers are declared to track passed opts.
    app.add_flag("--version", "Print queryable version information from the rocfft library")
        ->each([](const std::string&) {
            char v[256];
            rocfft_get_version_string(v, 256);
            std::cout << "version " << v << std::endl;
            return EXIT_SUCCESS;
        });

    CLI::Option* opt_token
        = app.add_option("--token", token, "Token to read FFT params from")->default_val("");

    // Group together options that conflict with --token
    auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token");
    non_token
        ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
        ->each([&](const std::string&) { params.precision = fft_precision_double; });
    non_token->excludes(opt_token);
    non_token
        ->add_option("-t, --transformType",
                     params.transform_type,
                     "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
                     "forward\n3) real inverse")
        ->default_val(fft_transform_type_complex_forward);
    non_token
        ->add_option(
            "--precision", params.precision, "Transform precision: single (default), double, half")
        ->excludes("--double");
    CLI::Option* opt_not_in_place
        = non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)")
              ->each([&](const std::string&) { params.placement = fft_placement_notinplace; });
    non_token
        ->add_option("--itype",
                     params.itype,
                     "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    non_token
        ->add_option("--otype",
                     params.otype,
                     "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    CLI::Option* opt_length
        = non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3);
    non_token->add_option("--ngpus", ngpus, "Number of GPUs to use")
        ->default_val(1)
        ->check(CLI::NonNegativeNumber);
    // define multi-GPU grids for FFT computation,
    CLI::Option* opt_ingrid
        = non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input")
              ->expected(1, 3)
              ->needs("--ngpus");
    CLI::Option* opt_outgrid
        = non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output")
              ->expected(1, 3)
              ->needs("--ngpus");
    non_token
        ->add_option("-b, --batchSize",
                     params.nbatch,
                     "If this value is greater than one, arrays will be used")
        ->default_val(1);
    CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides");
    CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides");
    non_token->add_option("--idist", params.idist, "Logical distance between input batches")
        ->default_val(0)
        ->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; });
    non_token->add_option("--odist", params.odist, "Logical distance between output batches")
        ->default_val(0)
        ->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; });
    CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset");
    CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset");

    app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures",
                 ignore_hip_runtime_failures,
                 "Ignore hip runtime failures");
    app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
    app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0);
    // ntrial sizes the timing vector below, so a negative count must be rejected at
    // parse time rather than being converted to a huge unsigned size.
    app.add_option("-N, --ntrial", ntrial, "Trial size for the problem")
        ->default_val(1)
        ->check(CLI::NonNegativeNumber)
        ->each([&](const std::string& val) {
            std::cout << "Running profile with " << val << " samples\n";
        });
    // Default value is set in fft_params.h based on if device-side PRNG was enabled.
    app.add_option("-g, --inputGen",
                   params.igen,
                   "Input data generation:\n0) PRNG sequence (device)\n"
                   "1) PRNG sequence (host)\n"
                   "2) linearly-spaced sequence (device)\n"
                   "3) linearly-spaced sequence (host)");
    app.add_option("--isize", params.isize, "Logical size of input buffer");
    app.add_option("--osize", params.osize, "Logical size of output buffer");
    app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output");

    // Parse args and catch any errors here
    try
    {
        app.parse(argc, argv);
    }
    catch(const CLI::ParseError& e)
    {
        return app.exit(e);
    }

    if(!token.empty())
    {
        std::cout << "Reading fft params from token:\n" << token << std::endl;

        try
        {
            params.from_token(token);
        }
        catch(...)
        {
            std::cout << "Unable to parse token." << std::endl;
            return EXIT_FAILURE;
        }
        std::cout << std::flush;
    }
    else // generate token
    {
        if(ngpus > 1)
        {
            // set default GPU grids in case none were given
            params.set_default_grid(ngpus, ingrid, outgrid);

            // split the problem among ngpus
            params.mp_lib = fft_params::fft_mp_lib_none;

            int localDeviceCount = 0;
            (void)hipGetDeviceCount(&localDeviceCount);

            // start with all-ones in grids
            std::vector input_grid(params.length.size() + 1, 1);
            std::vector output_grid(params.length.size() + 1, 1);

            // create input and output grids and distribute it according to user requirements
            std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1);
            std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1);

            params.distribute_input(localDeviceCount, input_grid);
            params.distribute_output(localDeviceCount, output_grid);
        }

        // Echo the options that were given explicitly on the command line.
        if(*opt_not_in_place)
        {
            std::cout << "out-of-place\n";
        }
        else
        {
            std::cout << "in-place\n";
        }

        if(*opt_length)
        {
            std::cout << "length:";
            for(auto& i : params.length)
                std::cout << " " << i;
            std::cout << "\n";
        }

        if(*opt_istride)
        {
            std::cout << "istride:";
            for(auto& i : params.istride)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_ostride)
        {
            std::cout << "ostride:";
            for(auto& i : params.ostride)
                std::cout << " " << i;
            std::cout << "\n";
        }

        if(*opt_ioffset)
        {
            std::cout << "ioffset:";
            for(auto& i : params.ioffset)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_ooffset)
        {
            std::cout << "ooffset:";
            for(auto& i : params.ooffset)
                std::cout << " " << i;
            std::cout << "\n";
        }

        if(*opt_ingrid || !ingrid.empty())
        {
            std::cout << "input grid:";
            for(auto& i : ingrid)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_outgrid || !outgrid.empty())
        {
            std::cout << "output grid:";
            for(auto& i : outgrid)
                std::cout << " " << i;
            std::cout << "\n";
        }
        std::cout << "\n";
    }

    std::cout << std::flush;

    rocfft_setup();

    // Set GPU for single-device FFT computation
    rocfft_scoped_device dev(deviceId);

    params.validate();

    if(!params.valid(verbose))
    {
        throw std::runtime_error("Invalid parameters, add --verbose=1 for detail");
    }

    std::cout << "Token: " << params.token() << std::endl;
    if(verbose)
    {
        std::cout << params.str(" ") << std::endl;
    }

    // Check free and total available memory:
    size_t free  = 0;
    size_t total = 0;
    try
    {
        HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
    }
    catch(const rocfft_hip_runtime_error&)
    {
        return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
    }

    // First check: footprint reported by fft_params_vram_footprint() plus the
    // twiddle-table estimate.
    const auto raw_vram_footprint
        = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
    if(!vram_fits_problem(raw_vram_footprint, free))
    {
        std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
                  << ") raw data too large for device.\n";
        return EXIT_SUCCESS;
    }

    // Second check: footprint reported by params.vram_footprint().
    // NOTE(review): this message reuses the "raw data" wording of the first check.
    const auto vram_footprint = params.vram_footprint();
    if(!vram_fits_problem(vram_footprint, free))
    {
        std::cout << "SKIPPED: Problem size (" << vram_footprint
                  << ") raw data too large for device.\n";
        return EXIT_SUCCESS;
    }

    auto ret = params.create_plan();
    if(ret != fft_status_success)
        LIB_V_THROW(rocfft_status_failure, "Plan creation failed");

    // GPU input buffer:
    auto        ibuffer_sizes = params.ibuffer_sizes();
    std::vector ibuffer(ibuffer_sizes.size());
    std::vector pibuffer(ibuffer_sizes.size());
    for(unsigned int i = 0; i < ibuffer.size(); ++i)
    {
        try
        {
            HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed");
        }
        catch(const rocfft_hip_runtime_error&)
        {
            return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
        }
        pibuffer[i] = ibuffer[i].data();
    }

    // CPU-side input buffer
    std::vector ibuffer_cpu;

    auto is_host_gen = (params.igen == fft_input_generator_host
                        || params.igen == fft_input_random_generator_host);

#ifdef USE_HIPRAND
    if(!is_host_gen)
    {
        // Input data:
        params.compute_input(ibuffer);

        if(verbose > 1)
        {
            // Copy input to CPU
            try
            {
                ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
            }
            catch(const rocfft_hip_runtime_error&)
            {
                return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
            }
            for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
            {
                try
                {
                    HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(),
                                          ibuffer[idx].data(),
                                          ibuffer_sizes[idx],
                                          hipMemcpyDeviceToHost),
                                "hipMemcpy failed");
                }
                catch(const rocfft_hip_runtime_error&)
                {
                    return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
                }
            }

            std::cout << "GPU input:\n";
            params.print_ibuffer(ibuffer_cpu);
        }
    }
#endif
    if(is_host_gen)
    {
        // Input data:
        ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
        params.compute_input(ibuffer_cpu);

        if(verbose > 1)
        {
            std::cout << "GPU input:\n";
            params.print_ibuffer(ibuffer_cpu);
        }

        for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
        {
            try
            {
                HIP_V_THROW(hipMemcpy(pibuffer[idx],
                                      ibuffer_cpu[idx].data(),
                                      ibuffer_cpu[idx].size(),
                                      hipMemcpyHostToDevice),
                            "hipMemcpy failed");
            }
            catch(const rocfft_hip_runtime_error&)
            {
                return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
            }
        }
    }

    // GPU output buffer:
    std::vector  obuffer_data;
    std::vector* obuffer = &obuffer_data;
    if(params.placement == fft_placement_inplace)
    {
        // In-place: the output aliases the input buffers.
        obuffer = &ibuffer;
    }
    else
    {
        auto obuffer_sizes = params.obuffer_sizes();
        obuffer_data.resize(obuffer_sizes.size());
        for(unsigned int i = 0; i < obuffer_data.size(); ++i)
        {
            // Honour ignore_hip_runtime_failures here as well, exactly as for the
            // input buffer allocation above.
            try
            {
                HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]),
                            "Creating output Buffer failed");
            }
            catch(const rocfft_hip_runtime_error&)
            {
                return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
            }
        }
    }
    std::vector pobuffer(obuffer->size());
    for(unsigned int i = 0; i < obuffer->size(); ++i)
    {
        pobuffer[i] = obuffer->at(i).data();
    }

    // Scatter input out to other devices and adjust I/O buffers to match requested transform
    params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);

    // Execute a warm-up call
    params.execute(pibuffer.data(), pobuffer.data());

    // Run the transform several times and record the execution time:
    std::vector gpu_time(ntrial);

    hipEvent_wrapper_t start, stop;
    start.alloc();
    stop.alloc();
    for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial)
    {
        // Create input at every iteration to avoid overflow
        if(params.ifields.empty())
        {
#ifdef USE_HIPRAND
            // Compute input on default device
            if(!is_host_gen)
                params.compute_input(ibuffer);
#endif
            if(is_host_gen)
            {
                for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
                {
                    try
                    {
                        HIP_V_THROW(hipMemcpy(pibuffer[idx],
                                              ibuffer_cpu[idx].data(),
                                              ibuffer_cpu[idx].size(),
                                              hipMemcpyHostToDevice),
                                    "hipMemcpy failed");
                    }
                    catch(const rocfft_hip_runtime_error&)
                    {
                        return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
                    }
                }
            }

            // Scatter input out to other devices if this is a multi-GPU test
            params.multi_gpu_prepare(ibuffer, pibuffer, pobuffer);
        }

        // Time a single execution with a pair of HIP events.
        HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");

        params.execute(pibuffer.data(), pobuffer.data());

        HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed");
        HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed");

        float time;
        HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed");
        gpu_time[itrial] = time;

        // Print result after FFT transform
        if(verbose > 2)
        {
            // Gather data to default GPU if this is a multi-GPU test
            params.multi_gpu_finalize(*obuffer, pobuffer);

            auto output = allocate_host_buffer(params.precision, params.otype, params.osize);
            for(unsigned int idx = 0; idx < output.size(); ++idx)
            {
                try
                {
                    HIP_V_THROW(hipMemcpy(output[idx].data(),
                                          pobuffer.at(idx),
                                          output[idx].size(),
                                          hipMemcpyDeviceToHost),
                                "hipMemcpy failed");
                }
                catch(const rocfft_hip_runtime_error&)
                {
                    return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
                }
            }
            std::cout << "GPU output:\n";
            params.print_obuffer(output);
        }
    }

    std::cout << "\nExecution gpu time:";
    for(const auto& i : gpu_time)
    {
        std::cout << " " << i;
    }
    std::cout << " ms" << std::endl;

    std::cout << "Execution gflops:  ";
    // Operation-count model: k * N * log2(N) per batch, with k = 2.5 when either
    // side of the transform is real and k = 5.0 otherwise.
    const double totsize = product(params.length.begin(), params.length.end());
    const double k
        = ((params.itype == fft_array_type_real) || (params.otype == fft_array_type_real)) ? 2.5
                                                                                           : 5.0;
    const double opscount = (double)params.nbatch * k * totsize * log(totsize) / log(2.0);
    for(const auto& i : gpu_time)
    {
        // i is in milliseconds, so ops / (1e6 * ms) yields gflops.
        std::cout << " " << opscount / (1e6 * i);
    }
    std::cout << std::endl;

    rocfft_cleanup();
}
rocFFT-rocm-6.4.3/clients/bench/bench.h 0000664 0000000 0000000 00000006156 15015373413 0017567 0 ustar 00root root 0000000 0000000 // Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCFFT_BENCH_H
#define ROCFFT_BENCH_H
#include "rocfft/rocfft.h"
#include
#include
// Exception raised by hip_V_Throw when a checked HIP status is not hipSuccess.
// It carries the formatted diagnostic text as its what() message.
class rocfft_hip_runtime_error : public std::runtime_error
{
public:
    // msg: text returned by what(); defaults to an empty string.
    rocfft_hip_runtime_error(const std::string& msg = "")
        : std::runtime_error(msg)
    {
    }
};
// Checks a HIP status code, whether it comes straight from a HIP call or from a
// stored variable. Anything other than hipSuccess is turned into a diagnostic of
// the form
//   HIP_V_THROWERROR< code > (file Line: n): msg
// which is printed to std::cout and then thrown as rocfft_hip_runtime_error.
// Note: std::runtime_error does not take unicode strings as input, so only narrow
// strings are supported.
//
// res:      HIP status to check
// msg:      description appended to the diagnostic
// lineno:   line number of the call site
// fileName: file name of the call site
inline void
    hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName)
{
    // Nothing to report on success.
    if(res == hipSuccess)
        return;

    std::stringstream report;
    report << "HIP_V_THROWERROR< " << res << " > (" << fileName << " Line: " << lineno
           << "): " << msg;

    const std::string text(report.str());
    std::cout << text << std::endl;
    throw rocfft_hip_runtime_error(text);
}
// Exception raised by lib_V_Throw when a checked rocFFT status is not
// rocfft_status_success. It carries the formatted diagnostic text as its what()
// message.
class rocfft_runtime_error : public std::runtime_error
{
public:
    // msg: text returned by what(); defaults to an empty string.
    rocfft_runtime_error(const std::string& msg = "")
        : std::runtime_error(msg)
    {
    }
};
// Checks a rocFFT status code. Anything other than rocfft_status_success is turned
// into a diagnostic of the form
//   LIB_V_THROWERROR< code > (file Line: n): msg
// which is printed to std::cout and then thrown as rocfft_runtime_error.
//
// res:      rocFFT status to check
// msg:      description appended to the diagnostic
// lineno:   line number of the call site
// fileName: file name of the call site
inline void lib_V_Throw(rocfft_status      res,
                        const std::string& msg,
                        size_t             lineno,
                        const std::string& fileName)
{
    // Nothing to report on success.
    if(res == rocfft_status_success)
        return;

    std::stringstream report;
    report << "LIB_V_THROWERROR< " << res << " > (" << fileName << " Line: " << lineno
           << "): " << msg;

    const std::string text(report.str());
    std::cout << text << std::endl;
    throw rocfft_runtime_error(text);
}
// Call-site wrappers: forward the status and message to hip_V_Throw / lib_V_Throw
// together with the line number and file name of the place the macro is expanded.
#define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__)
#define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__)
#endif // ROCFFT_BENCH_H
rocFFT-rocm-6.4.3/clients/bench/dyna-bench.cpp 0000664 0000000 0000000 00000077471 15015373413 0021063 0 ustar 00root root 0000000 0000000 // Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// This file allows one to run tests on multiple different rocFFT libraries at the same time.
// This allows one to randomize the execution order for a better experimental setup
// which produces fewer type 1 errors where one incorrectly rejects the null hypothesis.
#include
#if __has_include()
#include
#else
#include
namespace std
{
namespace filesystem = experimental::filesystem;
}
#endif
#include
#include
#include
#include
#ifdef WIN32
#include
// psapi.h requires windows.h to be included first
#include
#else
#include
#include
#endif
#include "../../shared/CLI11.hpp"
#include "../../shared/gpubuf.h"
#include "../../shared/hip_object_wrapper.h"
#include "../../shared/rocfft_params.h"
#include "bench.h"
#include "rocfft/rocfft.h"
// Platform-specific handle type for a dynamically loaded rocFFT library:
// HMODULE on Windows, an opaque void* elsewhere.
#ifdef WIN32
typedef HMODULE ROCFFT_LIB;
#else
typedef void* ROCFFT_LIB;
#endif
// Open the rocFFT shared library found at `path` and hand back its handle.
// Windows uses LoadLibraryA; other platforms use dlopen with RTLD_LAZY.
// Callers compare the returned handle against NULL to detect a failed load.
ROCFFT_LIB rocfft_lib_load(const std::string& path)
{
    const char* lib_path = path.c_str();
#ifdef WIN32
    ROCFFT_LIB handle = LoadLibraryA(lib_path);
#else
    ROCFFT_LIB handle = dlopen(lib_path, RTLD_LAZY);
#endif
    return handle;
}
// Describe the most recent library-loading error as a C string.
// Non-Windows platforms return whatever dlerror() reports. Windows has no
// equivalent text here, so the numeric GetLastError() code is formatted into a
// function-local static string and a pointer to that storage is returned.
const char* rocfft_lib_load_error()
{
#ifndef WIN32
    return dlerror();
#else
    // just return the error number
    static std::string error_str;
    error_str = std::to_string(GetLastError());
    return error_str.c_str();
#endif
}
// Get symbol from rocfft lib
//
// Looks up `sym` in the library identified by `libhandle` and returns its address
// (GetProcAddress on Windows, dlsym elsewhere). Callers in this file compare the
// result against NULL to detect a missing symbol.
void* rocfft_lib_symbol(ROCFFT_LIB libhandle, const char* sym)
{
#ifdef WIN32
return reinterpret_cast(GetProcAddress(libhandle, sym));
#else
return dlsym(libhandle, sym);
#endif
}
// Release a library handle previously returned by rocfft_lib_load
// (dlclose on non-Windows platforms, FreeLibrary on Windows).
// The result of the underlying close call is not inspected.
void rocfft_lib_close(ROCFFT_LIB libhandle)
{
#ifndef WIN32
    dlclose(libhandle);
#else
    FreeLibrary(libhandle);
#endif
}
// Given a libhandle from dload, create and return a rocFFT plan with the given
// parameters.
//
// All required entry points are resolved from `libhandle` first; a missing symbol is
// reported as rocfft_runtime_error instead of being called through a null pointer.
// The library's rocfft_setup is then invoked, a plan description carrying the data
// layout from `params` is built, and the plan is created from it. The temporary plan
// description is destroyed on every path, including when layout or plan creation
// fails.
//
// Throws rocfft_runtime_error if a symbol cannot be resolved or a rocFFT call does
// not return rocfft_status_success.
rocfft_plan make_plan(ROCFFT_LIB libhandle, const fft_params& params)
{
    auto procfft_setup = (decltype(&rocfft_setup))rocfft_lib_symbol(libhandle, "rocfft_setup");
    if(procfft_setup == NULL)
        throw rocfft_runtime_error("rocfft_setup failed");

    auto procfft_plan_description_create
        = (decltype(&rocfft_plan_description_create))rocfft_lib_symbol(
            libhandle, "rocfft_plan_description_create");
    if(procfft_plan_description_create == NULL)
        throw rocfft_runtime_error("rocfft_plan_description_create symbol not found");

    auto procfft_plan_description_destroy
        = (decltype(&rocfft_plan_description_destroy))rocfft_lib_symbol(
            libhandle, "rocfft_plan_description_destroy");
    if(procfft_plan_description_destroy == NULL)
        throw rocfft_runtime_error("rocfft_plan_description_destroy symbol not found");

    auto procfft_plan_description_set_data_layout
        = (decltype(&rocfft_plan_description_set_data_layout))rocfft_lib_symbol(
            libhandle, "rocfft_plan_description_set_data_layout");
    if(procfft_plan_description_set_data_layout == NULL)
        throw rocfft_runtime_error("rocfft_plan_description_set_data_layout symbol not found");

    auto procfft_plan_create
        = (decltype(&rocfft_plan_create))rocfft_lib_symbol(libhandle, "rocfft_plan_create");
    if(procfft_plan_create == NULL)
        throw rocfft_runtime_error("rocfft_plan_create symbol not found");

    procfft_setup();

    rocfft_plan_description desc = NULL;
    LIB_V_THROW(procfft_plan_description_create(&desc), "rocfft_plan_description_create failed");

    const auto layout_status
        = procfft_plan_description_set_data_layout(desc,
                                                   rocfft_array_type_from_fftparams(params.itype),
                                                   rocfft_array_type_from_fftparams(params.otype),
                                                   params.ioffset.data(),
                                                   params.ooffset.data(),
                                                   params.istride.size(),
                                                   params.istride.data(),
                                                   params.idist,
                                                   params.ostride.size(),
                                                   params.ostride.data(),
                                                   params.odist);
    if(layout_status != rocfft_status_success)
    {
        // Best-effort release of the description before reporting the layout error.
        (void)procfft_plan_description_destroy(desc);
    }
    LIB_V_THROW(layout_status, "rocfft_plan_description_data_layout failed");

    rocfft_plan plan = NULL;
    const auto  plan_status
        = procfft_plan_create(&plan,
                              rocfft_result_placement_from_fftparams(params.placement),
                              rocfft_transform_type_from_fftparams(params.transform_type),
                              rocfft_precision_from_fftparams(params.precision),
                              params.length.size(),
                              params.length.data(),
                              params.nbatch,
                              desc);

    // The description is no longer needed whether or not plan creation succeeded, so
    // release it before checking either status.
    const auto destroy_status = procfft_plan_description_destroy(desc);

    LIB_V_THROW(plan_status, "rocfft_plan_create failed");
    LIB_V_THROW(destroy_status, "rocfft_plan_description_destroy failed");

    return plan;
}
// Given a libhandle from dload and a rocFFT plan, destroy the plan.
//
// After the plan is destroyed, the library's rocfft_cleanup is also called when that
// symbol can be resolved.
//
// Throws rocfft_runtime_error if rocfft_plan_destroy cannot be resolved or if either
// rocFFT call does not return rocfft_status_success.
void destroy_plan(ROCFFT_LIB libhandle, rocfft_plan& plan)
{
    auto procfft_plan_destroy
        = (decltype(&rocfft_plan_destroy))rocfft_lib_symbol(libhandle, "rocfft_plan_destroy");
    // Report a missing symbol instead of calling through a null pointer.
    if(procfft_plan_destroy == NULL)
        throw rocfft_runtime_error("rocfft_plan_destroy symbol not found");

    LIB_V_THROW(procfft_plan_destroy(plan), "rocfft_plan_destroy failed");

    auto procfft_cleanup
        = (decltype(&rocfft_cleanup))rocfft_lib_symbol(libhandle, "rocfft_cleanup");
    if(procfft_cleanup)
        LIB_V_THROW(procfft_cleanup(), "rocfft_cleanup failed");
}
// Given a libhandle from dload and a rocFFT execution info structure, destroy the info.
//
// Throws rocfft_runtime_error if rocfft_execution_info_destroy cannot be resolved in
// the library or does not return rocfft_status_success.
void destroy_info(ROCFFT_LIB libhandle, rocfft_execution_info& info)
{
    auto procfft_execution_info_destroy
        = (decltype(&rocfft_execution_info_destroy))rocfft_lib_symbol(
            libhandle, "rocfft_execution_info_destroy");
    // Report a missing symbol instead of calling through a null pointer.
    if(procfft_execution_info_destroy == NULL)
        throw rocfft_runtime_error("rocfft_execution_info_destroy symbol not found");

    LIB_V_THROW(procfft_execution_info_destroy(info), "rocfft_execution_info_destroy failed");
}
// Given a libhandle from dload, and a corresponding rocFFT plan, return how much work
// buffer is required.
//
// The value is whatever the library's rocfft_plan_get_work_buffer_size reports for
// `plan`.
//
// Throws rocfft_runtime_error if that symbol cannot be resolved or the call does not
// return rocfft_status_success.
size_t get_wbuffersize(ROCFFT_LIB libhandle, const rocfft_plan& plan)
{
    auto procfft_plan_get_work_buffer_size
        = (decltype(&rocfft_plan_get_work_buffer_size))rocfft_lib_symbol(
            libhandle, "rocfft_plan_get_work_buffer_size");
    // Report a missing symbol instead of calling through a null pointer.
    if(procfft_plan_get_work_buffer_size == NULL)
        throw rocfft_runtime_error("rocfft_plan_get_work_buffer_size symbol not found");

    // Get the buffersize
    size_t workBufferSize = 0;
    LIB_V_THROW(procfft_plan_get_work_buffer_size(plan, &workBufferSize),
                "rocfft_plan_get_work_buffer_size failed");

    return workBufferSize;
}
// Given a libhandle from dload and a corresponding rocFFT plan, print the plan information.
//
// Printing is delegated to the library's rocfft_plan_get_print.
//
// Throws rocfft_runtime_error if that symbol cannot be resolved or the call does not
// return rocfft_status_success.
void show_plan(ROCFFT_LIB libhandle, const rocfft_plan& plan)
{
    auto procfft_plan_get_print
        = (decltype(&rocfft_plan_get_print))rocfft_lib_symbol(libhandle, "rocfft_plan_get_print");
    // Report a missing symbol instead of calling through a null pointer.
    if(procfft_plan_get_print == NULL)
        throw rocfft_runtime_error("rocfft_plan_get_print symbol not found");

    LIB_V_THROW(procfft_plan_get_print(plan), "rocfft_plan_get_print failed");
}
// Given a libhandle from dload, create and return a rocFFT execution info object
// using the library's rocfft_execution_info_create.
//
// Throws rocfft_runtime_error if that symbol cannot be resolved or the call does not
// return rocfft_status_success.
rocfft_execution_info make_execinfo(ROCFFT_LIB libhandle)
{
    auto procfft_execution_info_create = (decltype(&rocfft_execution_info_create))rocfft_lib_symbol(
        libhandle, "rocfft_execution_info_create");
    // Report a missing symbol instead of calling through a null pointer.
    if(procfft_execution_info_create == NULL)
        throw rocfft_runtime_error("rocfft_execution_info_create symbol not found");

    rocfft_execution_info info = NULL;
    LIB_V_THROW(procfft_execution_info_create(&info), "rocfft_execution_info_create failed");
    return info;
}
// FIXME: doc
void set_work_buffer(const ROCFFT_LIB& libhandle,
rocfft_execution_info& info,
const size_t wbuffersize,
void* wbuffer)
{
if(wbuffersize > 0 && wbuffer != NULL)
{
auto procfft_execution_info_set_work_buffer
= (decltype(&rocfft_execution_info_set_work_buffer))rocfft_lib_symbol(
libhandle, "rocfft_execution_info_set_work_buffer");
LIB_V_THROW(procfft_execution_info_set_work_buffer(info, wbuffer, wbuffersize),
"rocfft_execution_info_set_work_buffer failed");
}
}
// Given a libhandle from dload and a corresponding rocFFT plan and execution info,
// execute a transform on the given input and output buffers and return the kernel
// execution time.
//
// The time is the value hipEventElapsedTime reports between two events recorded
// immediately before and after the library's rocfft_execute call.
//
// Throws rocfft_runtime_error if rocfft_execute cannot be resolved,
// rocfft_hip_runtime_error (via HIP_V_THROW) if an event call fails, and
// std::runtime_error if the execution itself does not return rocfft_status_success.
float run_plan(
    ROCFFT_LIB libhandle, rocfft_plan plan, rocfft_execution_info info, void** in, void** out)
{
    auto procfft_execute
        = (decltype(&rocfft_execute))rocfft_lib_symbol(libhandle, "rocfft_execute");
    // Report a missing symbol instead of calling through a null pointer.
    if(procfft_execute == NULL)
        throw rocfft_runtime_error("rocfft_execute symbol not found");

    hipEvent_wrapper_t start, stop;
    start.alloc();
    stop.alloc();

    HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");

    auto rcfft = procfft_execute(plan, in, out, info);

    // Record and wait on the stop event before inspecting the execution status.
    HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed");
    HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed");

    if(rcfft != rocfft_status_success)
    {
        throw std::runtime_error("execution failed");
    }

    float time;
    HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed");
    return time;
}
// Load the rocFFT library at `libstring` and create a plan for `params` in it.
// Returns the library handle paired with the plan produced by make_plan.
// Throws std::runtime_error, including the loader's error text, when the library
// cannot be opened; exceptions from make_plan propagate to the caller.
// NOTE(review): if make_plan throws, the freshly opened libhandle is not closed
// here - confirm whether callers are expected to tolerate that.
std::pair create_handleplan(const std::string& libstring,
const fft_params& params)
{
auto libhandle = rocfft_lib_load(libstring);
// A NULL handle means the load failed; surface the platform error description.
if(libhandle == NULL)
{
std::stringstream ss;
ss << "Failed to open " << libstring << ", error: " << rocfft_lib_load_error();
throw std::runtime_error(ss.str());
}
auto plan = make_plan(libhandle, params);
return std::make_pair(libhandle, plan);
}
int main(int argc, char* argv[])
{
// Control output verbosity:
int verbose{};
// number of GPUs to use:
int ngpus{};
// hip Device number for running tests:
int deviceId{};
// Ignore runtime failures.
// eg: hipMalloc failing when there isn't enough free vram.
bool ignore_hip_runtime_failures{true};
// Number of performance trial samples:
int ntrial{};
// Bool to specify whether the libs are loaded in forward or forward+reverse order.
int reverse{};
// Test sequence choice:
int test_sequence{};
// Vector of test target libraries
std::vector lib_strings;
// FFT parameters:
fft_params params;
// input/output FFT grids
std::vector ingrid;
std::vector outgrid;
// Token string to fully specify fft params.
std::string token;
CLI::App app{"dyna-rocfft-bench command line options"};
// Declare the supported options. Some option pointers are declared to track passed opts.
// FIXME: version needs to be implemented
app.add_flag("--version",
"Print queryable version information from the rocfft library and exit");
app.add_flag("--reverse", reverse, "Load libs in forward and reverse order")->default_val(1);
app.add_option(
"--sequence", test_sequence, "Test sequence:\n0) random\n1) alternating\n2) sequential")
->default_val(0);
app.add_option("--lib", lib_strings, "Set test target library full path (appendable)");
CLI::Option* opt_token
= app.add_option("--token", token, "Token to read FFT params from")->default_val("");
// Group together options that conflict with --token
auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token");
non_token
->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
->each([&](const std::string&) { params.precision = fft_precision_double; });
non_token->excludes(opt_token);
non_token
->add_option("-t, --transformType",
params.transform_type,
"Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
"forward\n3) real inverse")
->default_val(fft_transform_type_complex_forward);
non_token
->add_option(
"--precision", params.precision, "Transform precision: single (default), double, half")
->excludes("--double");
CLI::Option* opt_not_in_place
= non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)")
->each([&](const std::string&) { params.placement = fft_placement_notinplace; });
non_token
->add_option("--itype",
params.itype,
"Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
"hermitian interleaved\n4) hermitian planar")
->default_val(fft_array_type_unset);
non_token
->add_option("--otype",
params.otype,
"Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
"hermitian interleaved\n4) hermitian planar")
->default_val(fft_array_type_unset);
CLI::Option* opt_length
= non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3);
non_token->add_option("--ngpus", ngpus, "Number of GPUs to use")
->default_val(1)
->check(CLI::NonNegativeNumber);
// define multi-GPU grids for FFT computation,
CLI::Option* opt_ingrid
= non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input")
->expected(1, 3)
->needs("--ngpus");
CLI::Option* opt_outgrid
= non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output")
->expected(1, 3)
->needs("--ngpus");
non_token
->add_option("-b, --batchSize",
params.nbatch,
"If this value is greater than one, arrays will be used")
->default_val(1);
CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides");
CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides");
non_token->add_option("--idist", params.idist, "Logical distance between input batches")
->default_val(0)
->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; });
non_token->add_option("--odist", params.odist, "Logical distance between output batches")
->default_val(0)
->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; });
CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset");
CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset");
app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures",
ignore_hip_runtime_failures,
"Ignore hip runtime failures");
app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0);
app.add_option("-N, --ntrial", ntrial, "Trial size for the problem")
->default_val(1)
->each([&](const std::string& val) {
std::cout << "Running profile with " << val << " samples\n";
});
// Default value is set in fft_params.h based on if device-side PRNG was enabled.
app.add_option("-g, --inputGen",
params.igen,
"Input data generation:\n0) PRNG sequence (device)\n"
"1) PRNG sequence (host)\n"
"2) linearly-spaced sequence (device)\n"
"3) linearly-spaced sequence (host)");
app.add_option("--isize", params.isize, "Logical size of input buffer");
app.add_option("--osize", params.osize, "Logical size of output buffer");
app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output");
// Parse args and catch any errors here
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
// Check if all the provided libraries are actually there:
for(const auto& lib_string : lib_strings)
{
if(!std::filesystem::exists(lib_string))
{
std::cerr << "Error: lib " << lib_string << " does not exist\n";
return EXIT_FAILURE;
}
}
if(!token.empty())
{
std::cout << "Reading fft params from token:\n" << token << std::endl;
try
{
params.from_token(token);
}
catch(...)
{
std::cout << "Unable to parse token." << std::endl;
return EXIT_FAILURE;
}
}
else
{
if(ngpus > 1)
{
// set default GPU grids in case none were given
params.set_default_grid(ngpus, ingrid, outgrid);
// split the problem among ngpus
params.mp_lib = fft_params::fft_mp_lib_none;
int localDeviceCount = 0;
(void)hipGetDeviceCount(&localDeviceCount);
// start with all-ones in grids
std::vector input_grid(params.length.size() + 1, 1);
std::vector output_grid(params.length.size() + 1, 1);
// create input and output grids and distribute it according to user requirements
std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1);
std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1);
params.distribute_input(localDeviceCount, input_grid);
params.distribute_output(localDeviceCount, output_grid);
}
if(*opt_not_in_place)
{
std::cout << "out-of-place\n";
}
else
{
std::cout << "in-place\n";
}
if(*opt_length)
{
std::cout << "length:";
for(auto& i : params.length)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_istride)
{
std::cout << "istride:";
for(auto& i : params.istride)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ostride)
{
std::cout << "ostride:";
for(auto& i : params.ostride)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ioffset)
{
std::cout << "ioffset:";
for(auto& i : params.ioffset)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ooffset)
{
std::cout << "ooffset:";
for(auto& i : params.ooffset)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ingrid || !ingrid.empty())
{
std::cout << "input grid:";
for(auto& i : ingrid)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_outgrid || !outgrid.empty())
{
std::cout << "output grid:";
for(auto& i : outgrid)
std::cout << " " << i;
std::cout << "\n";
}
}
std::cout << std::flush;
// Set GPU for single-device FFT computation
rocfft_scoped_device dev(deviceId);
params.validate();
if(!params.valid(verbose))
{
throw rocfft_runtime_error("Invalid parameters, add --verbose=1 for detail");
}
std::cout << "Token: " << params.token() << std::endl;
if(verbose)
{
std::cout << params.str() << std::endl;
}
// Check free and total available memory:
size_t free = 0;
size_t total = 0;
try
{
HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
const auto raw_vram_footprint
= params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
if(!vram_fits_problem(raw_vram_footprint, free))
{
std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
<< ") raw data too large for device.\n";
return EXIT_SUCCESS;
}
// GPU input buffer:
auto ibuffer_sizes = params.ibuffer_sizes();
std::vector ibuffer(ibuffer_sizes.size());
std::vector pibuffer(ibuffer_sizes.size());
for(unsigned int i = 0; i < ibuffer.size(); ++i)
{
try
{
HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
pibuffer[i] = ibuffer[i].data();
}
// CPU-side input buffer
std::vector ibuffer_cpu;
auto is_host_gen = (params.igen == fft_input_generator_host
|| params.igen == fft_input_random_generator_host);
#ifdef USE_HIPRAND
if(!is_host_gen)
{
// Input data:
params.compute_input(ibuffer);
if(verbose > 1)
{
// Copy input to CPU
ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
{
try
{
HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(),
ibuffer[idx].data(),
ibuffer_sizes[idx],
hipMemcpyDeviceToHost),
"hipMemcpy failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
std::cout << "GPU input:\n";
params.print_ibuffer(ibuffer_cpu);
}
}
#endif
if(is_host_gen)
{
// Input data:
ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
params.compute_input(ibuffer_cpu);
if(verbose > 1)
{
std::cout << "GPU input:\n";
params.print_ibuffer(ibuffer_cpu);
}
for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
{
try
{
HIP_V_THROW(hipMemcpy(pibuffer[idx],
ibuffer_cpu[idx].data(),
ibuffer_cpu[idx].size(),
hipMemcpyHostToDevice),
"hipMemcpy failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
}
// GPU output buffer:
std::vector obuffer_data;
std::vector* obuffer = &obuffer_data;
if(params.placement == fft_placement_inplace)
{
obuffer = &ibuffer;
}
else
{
auto obuffer_sizes = params.obuffer_sizes();
obuffer_data.resize(obuffer_sizes.size());
for(unsigned int i = 0; i < obuffer_data.size(); ++i)
{
try
{
HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]),
"Creating output Buffer failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
}
std::vector pobuffer(obuffer->size());
for(unsigned int i = 0; i < obuffer->size(); ++i)
{
pobuffer[i] = obuffer->at(i).data();
}
// Execution times for loaded libraries:
std::vector> time(lib_strings.size());
// If we are doing a reverse-run, then we need two ntrials; otherwise, just one.
std::vector ntrial_runs;
if(reverse == 0)
{
ntrial_runs.push_back(ntrial);
}
else
{
ntrial_runs.push_back((ntrial + 1) / 2);
ntrial_runs.push_back(ntrial / 2);
}
for(size_t ridx = 0; ridx < ntrial_runs.size(); ++ridx)
{
std::vector> index_lib_string;
for(size_t i = 0; i < lib_strings.size(); ++i)
{
index_lib_string.push_back(std::make_pair(i, lib_strings[i]));
}
if(ridx == 1)
{
std::reverse(index_lib_string.begin(), index_lib_string.end());
}
// Create the handles to the libs and the associated fft plans.
std::vector handle;
std::vector plan;
// Allocate the work buffer: just one, big enough for any dloaded library.
std::vector info;
size_t wbuffer_size = 0;
for(unsigned int idx = 0; idx < lib_strings.size(); ++idx)
{
std::cout << idx << ": " << lib_strings[idx] << "\n";
auto libhandle = rocfft_lib_load(lib_strings[idx]);
if(libhandle == NULL)
{
std::cout << "Failed to open " << lib_strings[idx]
<< ", error: " << rocfft_lib_load_error() << "\n";
return 1;
}
handle.push_back(libhandle);
plan.push_back(make_plan(handle[idx], params));
show_plan(handle[idx], plan[idx]);
wbuffer_size = std::max(wbuffer_size, get_wbuffersize(handle[idx], plan[idx]));
info.push_back(make_execinfo(handle[idx]));
}
std::cout << "Work buffer size: " << wbuffer_size << std::endl;
if(!vram_fits_problem(raw_vram_footprint + wbuffer_size, free))
{
std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << " + " << +wbuffer_size
<< " = " << raw_vram_footprint + wbuffer_size
<< " ) data too large for device.\n";
return EXIT_SUCCESS;
}
gpubuf wbuffer;
if(wbuffer_size)
{
try
{
HIP_V_THROW(wbuffer.alloc(wbuffer_size), "Creating intermediate Buffer failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
// Associate the work buffer to the individual libraries:
for(unsigned int idx = 0; idx < lib_strings.size(); ++idx)
{
set_work_buffer(handle[idx], info[idx], wbuffer_size, wbuffer.data());
}
// Run the plan using its associated rocFFT library:
for(unsigned int idx = 0; idx < handle.size(); ++idx)
{
try
{
run_plan(handle[idx], plan[idx], info[idx], pibuffer.data(), pobuffer.data());
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
std::vector testcase(ntrial_runs[ridx] * index_lib_string.size());
switch(test_sequence)
{
case 0:
{
// Random order:
for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial)
{
for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib)
{
testcase[index_lib_string.size() * itrial + ilib] = ilib;
}
}
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(testcase.begin(), testcase.end(), g);
break;
}
case 1:
// Alternating order:
for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial)
{
for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib)
{
testcase[index_lib_string.size() * itrial + ilib] = ilib;
}
}
break;
case 2:
// Sequential order:
for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial)
{
for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib)
{
testcase[ilib * ntrial + itrial] = ilib;
}
}
break;
default:
throw std::runtime_error("Invalid test sequence choice.");
}
if(verbose > 3)
{
std::cout << "Test case order:";
for(const auto val : testcase)
std::cout << " " << val;
std::cout << "\n";
}
std::cout << "Running the tests...\n";
for(size_t itest = 0; itest < testcase.size(); ++itest)
{
const int tidx = testcase[itest];
if(verbose > 3)
{
std::cout << "running test case " << tidx << " with lib "
<< index_lib_string[tidx].second << "\n";
}
#ifdef USE_HIPRAND
if(!is_host_gen)
params.compute_input(ibuffer);
#endif
if(is_host_gen)
{
for(unsigned int bidx = 0; bidx < ibuffer_cpu.size(); ++bidx)
{
try
{
HIP_V_THROW(hipMemcpy(pibuffer[bidx],
ibuffer_cpu[bidx].data(),
ibuffer_cpu[bidx].size(),
hipMemcpyHostToDevice),
"hipMemcpy failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
}
// Run the plan using its associated rocFFT library:
try
{
time[tidx].push_back(run_plan(
handle[tidx], plan[tidx], info[tidx], pibuffer.data(), pobuffer.data()));
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
if(verbose > 2)
{
auto output = allocate_host_buffer(params.precision, params.otype, params.osize);
for(unsigned int iout = 0; iout < output.size(); ++iout)
{
try
{
HIP_V_THROW(hipMemcpy(output[iout].data(),
pobuffer[iout],
output[iout].size(),
hipMemcpyDeviceToHost),
"hipMemcpy failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
std::cout << "GPU output:\n";
params.print_obuffer(output);
}
}
// Clean up:
for(unsigned int hidx = 0; hidx < handle.size(); ++hidx)
{
destroy_info(handle[hidx], info[hidx]);
destroy_plan(handle[hidx], plan[hidx]);
rocfft_lib_close(handle[hidx]);
}
}
std::cout << "Execution times in ms:\n";
for(unsigned int idx = 0; idx < time.size(); ++idx)
{
std::cout << "\nExecution gpu time:";
for(auto& i : time[idx])
{
std::cout << " " << i;
}
std::cout << " ms" << std::endl;
}
return EXIT_SUCCESS;
}
rocFFT-rocm-6.4.3/clients/cmake/ 0000775 0000000 0000000 00000000000 15015373413 0016330 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/cmake/build-gtest.cmake 0000664 0000000 0000000 00000004604 15015373413 0021561 0 ustar 00root root 0000000 0000000 # Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
include( ExternalProject )

# BUILD_GTEST=ON skips the search for an existing GoogleTest and always
# builds the downloaded copy instead.
option( BUILD_GTEST "Download and build GoogleTest" OFF )

if( NOT BUILD_GTEST )
  # Not REQUIRED: a failed search falls through to the ExternalProject build.
  find_package( GTest 1.11.0 )
endif()

# Build GoogleTest ourselves when asked to, or when none was found -- but
# only if no "gtest" target has been defined yet.
if( (NOT TARGET gtest) AND (BUILD_GTEST OR NOT GTEST_FOUND) )
  # Source archive location and checksum; cache entries so they can be
  # overridden from the command line.
  set( GTEST_SRC_URL
       https://github.com/google/googletest/archive/release-1.11.0.tar.gz
       CACHE STRING "Location of GTest source code" )
  set( GTEST_SRC_SHA256
       b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5
       CACHE STRING "SHA256 hash of GTest source code" )

  # Header and static-library locations inside the ExternalProject tree
  # rooted at PREFIX (see below).
  set( GTEST_INCLUDE_DIRS
       ${CMAKE_CURRENT_BINARY_DIR}/src/gtest/googletest/include )
  set( GTEST_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}
       ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX} )

  # No install step: the libraries are consumed straight from the build tree.
  # BUILD_BYPRODUCTS declares the libraries as outputs of this external build.
  ExternalProject_Add( gtest
    URL              ${GTEST_SRC_URL}
    URL_HASH         SHA256=${GTEST_SRC_SHA256}
    PREFIX           ${CMAKE_CURRENT_BINARY_DIR}
    CMAKE_ARGS       -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}
    INSTALL_COMMAND  ""
    BUILD_BYPRODUCTS ${GTEST_LIBRARIES} )

  # Expose the external project's source_dir / binary_dir as variables.
  ExternalProject_Get_Property( gtest source_dir binary_dir )
endif()
rocFFT-rocm-6.4.3/clients/cmake/build-options.cmake 0000664 0000000 0000000 00000003600 15015373413 0022121 0 ustar 00root root 0000000 0000000 # Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# This file is intended to be used in two ways; independently in a stand alone PROJECT
# and as part of a superbuild. If the file is included in a stand alone project, the
# variables are not expected to be preset, and this will produce options() in the GUI
# for the user to examine. If this file is included in a superbuild, the options will be
# presented in the superbuild GUI, but then passed into the ExternalProject as -D
# parameters, which would already define them.
# Each client option is declared only when it is not already set to a true
# value, and defaults to OFF.

# Unit tests
if( NOT BUILD_CLIENTS_TESTS )
  option( BUILD_CLIENTS_TESTS "Build rocFFT unit tests" OFF )
endif()

# Benchmarks
if( NOT BUILD_CLIENTS_BENCH )
  option( BUILD_CLIENTS_BENCH "Build rocFFT benchmarks" OFF )
endif()

# Samples
if( NOT BUILD_CLIENTS_SAMPLES )
  option( BUILD_CLIENTS_SAMPLES "Build rocFFT samples" OFF )
endif()
rocFFT-rocm-6.4.3/clients/samples/ 0000775 0000000 0000000 00000000000 15015373413 0016714 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/samples/CMakeLists.txt 0000664 0000000 0000000 00000005053 15015373413 0021457 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. It is a non-FORCE cache entry, so it has to be
# placed ahead of the project() command.
if( NOT WIN32 )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm"
       CACHE PATH "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package"
       CACHE PATH "Install path prefix, prepended onto install directories" )
endif()

# Default the build type to Release unless the user passed one with -D.
# This also has to happen before project(); MSVC_IDE does not use
# CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release
       CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Scope marker set before the sample subdirectories are added.
set( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ON )

project( rocfft-clients-samples LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Sample subdirectories; the MPI sample is added only when MPI is enabled.
list( APPEND samples_subdirs
      "fixed-16"
      "fixed-large"
      "rocfft"
      "multi_gpu" )
if( ROCFFT_MPI_ENABLE )
  list( APPEND samples_subdirs "mpi" )
endif()

foreach( client ${samples_subdirs} )
  add_subdirectory( ${client} )
endforeach()
rocFFT-rocm-6.4.3/clients/samples/fixed-16/ 0000775 0000000 0000000 00000000000 15015373413 0020237 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/samples/fixed-16/CMakeLists.txt 0000664 0000000 0000000 00000007270 15015373413 0023005 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. It is a non-FORCE cache entry, so it has to be
# placed ahead of the project() command.
if( NOT WIN32 )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm"
       CACHE PATH "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package"
       CACHE PATH "Install path prefix, prepended onto install directories" )
endif()

# Default the build type to Release unless the user passed one with -D.
# This also has to happen before project(); MSVC_IDE does not use
# CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release
       CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-samples-fixed-16 LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Locate an installed rocFFT package only when no rocfft target is already
# defined in this build.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif()

# HIP is required; skip the search if it has already been found.
if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()
# One executable is built per entry; each is compiled from the source file
# "<entry>.cpp".
set( sample_list fixed-16-float fixed-16-double fixed-16-half )
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the bare `$` argument below looks like a truncated generator
# expression (e.g. a $<BUILD_INTERFACE:...> include path) -- confirm against
# the upstream file before relying on it.
target_include_directories( ${sample}
PRIVATE $
)
target_link_libraries( ${sample} PRIVATE roc::rocfft hip::device )
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )
# The samples are compiled as C++17, with no fallback to an older standard.
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Choose the output directory relative to PROJECT_BINARY_DIR. The suffix
# depends on which *_BUILD_SCOPE variable is set, checked in this order:
# ROCFFT_BUILD_SCOPE, ROCFFT_CLIENTS_BUILD_SCOPE,
# ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE; otherwise "/bin" is used.
if( ROCFFT_BUILD_SCOPE )
set( FIXED_16_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( FIXED_16_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( FIXED_16_OUT_DIR "/../bin" )
else()
set( FIXED_16_OUT_DIR "/bin" )
endif()
# Prefix the suffix chosen above with the project binary directory.
string( CONCAT FIXED_16_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_16_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${FIXED_16_OUT_DIR})
# Extra include paths and the NVCC platform define, applied only when
# CUDA_FOUND is true.
# NOTE(review): the two bare `$` arguments below also look like truncated
# generator expressions -- confirm against the upstream file.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
# Additional host-side link libraries supplied by the enclosing build, if any.
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )
endforeach( )
rocFFT-rocm-6.4.3/clients/samples/fixed-16/fixed-16-double.cpp 0000664 0000000 0000000 00000011434 15015373413 0023541 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include "rocfft/rocfft.h"
#include
#include
#include
#include
int main()
{
const size_t N = 16;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(double2);
// Create HIP device object.
double2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = NULL;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_double,
1,
&length,
1,
NULL)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Check if the plan requires a work buffer
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
void* work_buf = nullptr;
rocfft_execution_info info = nullptr;
if(work_buf_size)
{
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Clean up work buffer
if(work_buf_size)
{
if(hipFree(work_buf) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
}
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/fixed-16/fixed-16-float.cpp 0000664 0000000 0000000 00000011430 15015373413 0023370 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include "rocfft/rocfft.h"
#include
#include
#include
#include
int main()
{
const size_t N = 16;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(float2);
// Create HIP device object.
float2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = NULL;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_single,
1,
&length,
1,
NULL)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Check if the plan requires a work buffer
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
void* work_buf = nullptr;
rocfft_execution_info info = nullptr;
if(work_buf_size)
{
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Clean up work buffer
if(work_buf_size)
{
if(hipFree(work_buf) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
}
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/fixed-16/fixed-16-half.cpp 0000664 0000000 0000000 00000011660 15015373413 0023202 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include "rocfft/rocfft.h"
#include
#include
#include
#include
// Sample: in-place, single-batch, 1D complex-to-complex forward FFT of
// length 16 in half precision.
//
// The program fills a host array, copies it to the device, builds and runs a
// rocFFT plan (attaching a work buffer only if the plan asks for one), copies
// the result back and prints input and output side by side.
//
// Returns 0 on success; any failing HIP or rocFFT call throws
// std::runtime_error.
int main()
{
    const size_t N = 16;

    // Host-side input: a small deterministic real signal, imaginary part zero.
    std::vector<_Float16_2> cx(N);
    for(size_t i = 0; i < N; i++)
    {
        cx[i].x = static_cast<_Float16>(i + (i % 3) - (i % 7));
        cx[i].y = 0;
    }

    // rocfft gpu compute
    // ========================================

    if(rocfft_setup() != rocfft_status_success)
        throw std::runtime_error("rocfft_setup failed.");

    size_t Nbytes = N * sizeof(_Float16_2);

    // Create HIP device object.
    _Float16_2* x = nullptr;
    if(hipMalloc(&x, Nbytes) != hipSuccess)
        throw std::runtime_error("hipMalloc failed.");

    // Copy data to device
    if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
        throw std::runtime_error("hipMemcpy failed.");

    // Create plan
    rocfft_plan plan   = nullptr;
    size_t      length = N;
    if(rocfft_plan_create(&plan,
                          rocfft_placement_inplace,
                          rocfft_transform_type_complex_forward,
                          rocfft_precision_half,
                          1, // number of dimensions
                          &length,
                          1, // number of transforms
                          nullptr) // no plan description
       != rocfft_status_success)
        throw std::runtime_error("rocfft_plan_create failed.");

    // Check if the plan requires a work buffer
    size_t work_buf_size = 0;
    if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
        throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");

    // The execution info object is only needed to hand a work buffer to the
    // library, so it stays null when no buffer is required.
    void*                 work_buf = nullptr;
    rocfft_execution_info info     = nullptr;
    if(work_buf_size)
    {
        if(rocfft_execution_info_create(&info) != rocfft_status_success)
            throw std::runtime_error("rocfft_execution_info_create failed.");
        if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
            throw std::runtime_error("hipMalloc failed.");
        if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
           != rocfft_status_success)
            throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
    }

    // Execute plan (in-place: no separate output buffer is passed)
    if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
        throw std::runtime_error("rocfft_execute failed.");

    // Wait for the transform to finish before releasing anything it uses.
    if(hipDeviceSynchronize() != hipSuccess)
        throw std::runtime_error("hipDeviceSynchronize failed.");

    // Clean up work buffer
    if(work_buf_size)
    {
        if(hipFree(work_buf) != hipSuccess)
            throw std::runtime_error("hipFree failed.");
        if(rocfft_execution_info_destroy(info) != rocfft_status_success)
            throw std::runtime_error("rocfft_execution_info_destroy failed.");
        info = nullptr;
    }

    // Destroy plan
    if(rocfft_plan_destroy(plan) != rocfft_status_success)
        throw std::runtime_error("rocfft_plan_destroy failed.");
    plan = nullptr;

    // Copy result back to host
    std::vector<_Float16_2> y(N);
    if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
        throw std::runtime_error("hipMemcpy failed.");

    // Half-precision values are widened to double so the stream prints them
    // as ordinary numbers.
    for(size_t i = 0; i < N; i++)
    {
        std::cout << "element " << i << " input: (" << static_cast<double>(cx[i].x) << ","
                  << static_cast<double>(cx[i].y) << ")"
                  << " output: (" << static_cast<double>(y[i].x) << ","
                  << static_cast<double>(y[i].y) << ")" << std::endl;
    }

    if(hipFree(x) != hipSuccess)
        throw std::runtime_error("hipFree failed.");

    if(rocfft_cleanup() != rocfft_status_success)
        throw std::runtime_error("rocfft_cleanup failed.");

    return 0;
}
rocFFT-rocm-6.4.3/clients/samples/fixed-large/ 0000775 0000000 0000000 00000000000 15015373413 0021103 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/samples/fixed-large/CMakeLists.txt 0000664 0000000 0000000 00000007274 15015373413 0023655 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. This has to appear before the project() command
# because it does not use FORCE: it only supplies a default when the cache
# does not already hold a value.
#
# PROJECT_BINARY_DIR is only set by project(), so in a standalone configure it
# is still empty at this point and "${PROJECT_BINARY_DIR}/package" would
# collapse to "/package". CMAKE_BINARY_DIR is already valid here.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# This has to be initialized before the project() command appears.
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-samples-fixed-large LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" )

# Only look for an installed rocFFT when no rocfft target exists in this build.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG )
endif()

# Skip the HIP lookup when HIP_FOUND is already set by an enclosing scope.
if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()
# Build one executable per entry in sample_list; each entry names both the
# target and its <name>.cpp source file.
set( sample_list fixed-large-float fixed-large-double )
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the lone `$` below looks like the remnant of a generator
# expression (something of the form $<...>) that was lost; confirm the
# intended include directory before relying on this.
target_include_directories( ${sample}
PRIVATE $
)
target_link_libraries( ${sample} PRIVATE roc::rocfft )
# WARNING_FLAGS is not set in this file; presumably supplied by the enclosing
# build (expands to nothing when unset).
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Choose where the executable is written, as a path relative to
# PROJECT_BINARY_DIR, depending on which *_BUILD_SCOPE variable is set.
# With none of them set, the executable goes to <binary dir>/bin.
if( ROCFFT_BUILD_SCOPE )
set( FIXED_LARGE_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( FIXED_LARGE_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( FIXED_LARGE_OUT_DIR "/../bin" )
else()
set( FIXED_LARGE_OUT_DIR "/bin" )
endif()
# Prefix the relative suffix chosen above with the project binary directory.
string( CONCAT FIXED_LARGE_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_LARGE_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${FIXED_LARGE_OUT_DIR})
# Extra include directories and a platform define when CUDA_FOUND is set.
# NOTE(review): the two lone `$` entries below also look like truncated
# generator expressions; confirm the intended directories.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
# ROCFFT_CLIENTS_HOST_LINK_LIBS is not set in this file; presumably provided
# by the enclosing clients build.
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )
endforeach( )
rocFFT-rocm-6.4.3/clients/samples/fixed-large/fixed-large-double.cpp 0000664 0000000 0000000 00000011654 15015373413 0025255 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include
#include
#include
#include "rocfft/rocfft.h"
#include
#include
int main()
{
// For size N >= 8192, temporary buffer is required to allocated
const size_t N = 64 * 2048;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(double2);
// Create HIP device object.
double2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = nullptr;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_double,
1,
&length,
1,
nullptr)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Setup work buffer
void* workBuffer = nullptr;
size_t workBufferSize = 0;
if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
// Setup exec info to pass work buffer to the library
rocfft_execution_info info = nullptr;
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(workBufferSize > 0)
{
printf("size of workbuffer=%d\n", (int)workBufferSize);
if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
if(workBuffer)
if(hipFree(workBuffer) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/fixed-large/fixed-large-float.cpp 0000664 0000000 0000000 00000011650 15015373413 0025104 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include
#include
#include
#include "rocfft/rocfft.h"
#include
#include
int main()
{
// For size N >= 8192, temporary buffer is required to allocated
const size_t N = 64 * 2048;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(float2);
// Create HIP device object.
float2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = nullptr;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_single,
1,
&length,
1,
nullptr)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Setup work buffer
void* workBuffer = nullptr;
size_t workBufferSize = 0;
if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
// Setup exec info to pass work buffer to the library
rocfft_execution_info info = nullptr;
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(workBufferSize > 0)
{
printf("size of workbuffer=%d\n", (int)workBufferSize);
if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
if(workBuffer)
if(hipFree(workBuffer) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/mpi/ 0000775 0000000 0000000 00000000000 15015373413 0017501 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/samples/mpi/CMakeLists.txt 0000664 0000000 0000000 00000010562 15015373413 0022245 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. This has to appear before the project() command
# because it does not use FORCE: it only supplies a default when the cache
# does not already hold a value.
#
# PROJECT_BINARY_DIR is only set by project(), so in a standalone configure it
# is still empty at this point and "${PROJECT_BINARY_DIR}/package" would
# collapse to "/package". CMAKE_BINARY_DIR is already valid here.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# This has to be initialized before the project() command appears.
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# NOTE(review): this name does not follow the directory-based pattern used by
# the fixed-large sample ("rocfft-clients-samples-fixed-large"); confirm it is
# intentional before renaming, since other scripts may refer to it.
project( rocfft-clients-samples-rocfft LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" )

# Only look for an installed rocFFT when no rocfft target exists in this build.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG )
endif()

# Each dependency lookup is skipped when its *_FOUND variable is already set
# by an enclosing scope.
if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()

if( NOT MPI_FOUND )
  find_package( MPI REQUIRED )
endif()

# hipRAND is only needed when the build was configured with USE_HIPRAND.
if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()
# Build one executable per entry in sample_list; each entry names both the
# target and its <name>.cpp source file.
set( sample_list rocfft_mpi_example )
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the lone `$` below looks like the remnant of a generator
# expression (something of the form $<...>) that was lost; confirm the
# intended include directory before relying on this.
target_include_directories(
${sample}
PRIVATE
$
${MPI_CXX_INCLUDE_PATH}
)
target_link_libraries(
${sample}
PRIVATE roc::rocfft
MPI::MPI_CXX
)
# Diagnostic output only; printed once per sample at configure time.
message( "MPI_CXX_LIB_NAMES: ${MPI_CXX_LIB_NAMES}")
# When ROCFFT_CRAY_MPI_ENABLE is set, additionally link mpi_gtl_hsa and add a
# library search directory derived from the location of MPI_LIBRARY.
# NOTE(review): the ../../../../gtl/lib hop assumes a particular install
# layout relative to MPI_LIBRARY; confirm for the target system.
if ( ROCFFT_CRAY_MPI_ENABLE )
target_link_libraries( ${sample}
PRIVATE
"mpi_gtl_hsa"
)
get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
target_link_directories( ${sample}
PRIVATE
${MPI_LIBDIR}/../../../../gtl/lib )
endif()
# Link hipRAND only when the build was configured with USE_HIPRAND.
if ( USE_HIPRAND )
target_link_libraries(
${sample}
PRIVATE
hip::hiprand
)
endif()
# WARNING_FLAGS is not set in this file; presumably supplied by the enclosing
# build. -Wno-cpp is always added for this sample.
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Choose where the executable is written, as a path relative to
# PROJECT_BINARY_DIR, depending on which *_BUILD_SCOPE variable is set.
# With none of them set, the executable goes to <binary dir>/bin.
if( ROCFFT_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../bin" )
else()
set( SAMPLES_ROCFFT_OUT_DIR "/bin" )
endif()
# Prefix the relative suffix chosen above with the project binary directory.
string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${SAMPLES_ROCFFT_OUT_DIR})
# Extra include directories and a platform define when CUDA_FOUND is set.
# NOTE(review): the two lone `$` entries below also look like truncated
# generator expressions; confirm the intended directories.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
# Neither link-library variable is set in this file; presumably both are
# provided by the enclosing clients build.
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
endforeach( )
rocFFT-rocm-6.4.3/clients/samples/mpi/rocfft_mpi_example.cpp 0000664 0000000 0000000 00000037732 15015373413 0024064 0 ustar 00root root 0000000 0000000
/******************************************************************************
* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include
#include
#include
#include
#include
#include
#include
#include
#include "rocfft.h"
// Check all ranks for an rocFFT non-success status.
auto rocfft_status_sync(const rocfft_status fftrc, const MPI_Comm comm)
{
// Since hipSuccess is the lowest enum value, we can find if there are any errors
// by getting the maximum value of the return code over all procs.
// Guarantee that the enum is an unsigned int so that we can send this via MPI:
static_assert(std::is_same_v, unsigned int>);
auto global_fftrc = rocfft_status_success;
const auto mpirc = MPI_Allreduce(&fftrc, &global_fftrc, 1, MPI_UNSIGNED, MPI_MAX, comm);
if(mpirc != MPI_SUCCESS)
{
return rocfft_status_failure;
}
return global_fftrc;
}
// Check all ranks for an hip runtime non-success status.
auto hip_status_sync(const hipError_t hiprc, const MPI_Comm comm)
{
// Since rocfft_status_success is the lowest enum value, we can find if there are any errors
// by getting the maximum value of the return code over all procs.
// Guarantee that the enum is an unsigned int so that we can send this via MPI:
static_assert(std::is_same_v, unsigned int>);
auto global_hiprc = hipSuccess;
const auto mpirc = MPI_Allreduce(&hiprc, &global_hiprc, 1, MPI_UNSIGNED, MPI_MAX, comm);
if(mpirc != MPI_SUCCESS)
{
return hipErrorUnknown;
}
return global_hiprc;
}
int main(int argc, char** argv)
{
MPI_Init(&argc, &argv);
MPI_Comm mpi_comm = MPI_COMM_WORLD;
int mpi_size = 0;
MPI_Comm_size(mpi_comm, &mpi_size);
int mpi_rank = 0;
MPI_Comm_rank(mpi_comm, &mpi_rank);
if(mpi_rank == 0)
{
std::cout << "rocFFT MPI example\n";
std::cout << "MPI size: " << mpi_size << "\n";
}
// General FFT parameters:
std::vector length = {8, 8};
const rocfft_transform_type direction = rocfft_transform_type_complex_forward;
const rocfft_result_placement place = rocfft_placement_notinplace;
auto fftrc = rocfft_status_success;
auto hiprc = hipSuccess;
fftrc = rocfft_setup();
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to set up rocFFT");
rocfft_plan_description description = nullptr;
rocfft_plan_description_create(&description);
fftrc = rocfft_plan_description_set_comm(description, rocfft_comm_mpi, &mpi_comm);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed add communicator to description");
// Do not set stride information via the descriptor, they are to be defined during field
// creation below
fftrc = rocfft_plan_description_set_data_layout(description,
rocfft_array_type_complex_interleaved,
rocfft_array_type_complex_interleaved,
nullptr,
nullptr,
0,
nullptr,
0,
0,
nullptr,
0);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to create description");
if(mpi_rank == 0)
{
std::cout << "input data decomposition:\n";
}
std::vector gpu_in = {nullptr};
{
rocfft_field infield = nullptr;
rocfft_field_create(&infield);
std::vector inbrick_stride = {1, length[1]};
const size_t inbrick_length1 = length[1] / (size_t)mpi_size
+ ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0);
const size_t inbrick_lower1
= mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size);
const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1;
std::vector inbrick_lower = {0, inbrick_lower1};
std::vector inbrick_upper = {length[0], inbrick_upper1};
rocfft_brick inbrick = nullptr;
rocfft_brick_create(&inbrick,
inbrick_lower.data(),
inbrick_upper.data(),
inbrick_stride.data(),
inbrick_lower.size(),
0);
rocfft_field_add_brick(infield, inbrick);
rocfft_brick_destroy(inbrick);
inbrick = nullptr;
const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex);
std::vector> host_in(length[0] * inbrick_length1);
for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0)
{
for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1)
{
const auto pos = (idx0 - inbrick_lower[0]) * inbrick_stride[0]
+ (idx1 - inbrick_lower[1]) * inbrick_stride[1];
host_in[pos] = std::complex(idx0, idx1);
}
}
// Serialize output:
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "in-brick rank " << irank;
std::cout << "\n\tlower indices:";
for(const auto val : inbrick_lower)
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : inbrick_upper)
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : inbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0)
{
for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1)
{
const auto pos = (idx0 - inbrick_lower[0]) * inbrick_stride[0]
+ (idx1 - inbrick_lower[1]) * inbrick_stride[1];
std::cout << host_in[pos] << " ";
}
std::cout << "\n";
}
}
MPI_Barrier(mpi_comm);
}
hiprc = hipMalloc(&gpu_in[0], memSize);
if(hiprc != hipSuccess)
throw std::runtime_error("inbrick hipMalloc failed");
hiprc = hipMemcpy(gpu_in[0], host_in.data(), memSize, hipMemcpyHostToDevice);
if(hiprc != hipSuccess)
throw std::runtime_error("inbrick hipMemcpy failed");
rocfft_plan_description_add_infield(description, infield);
fftrc = rocfft_field_destroy(infield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy infield");
}
if(mpi_rank == 0)
{
std::cout << "output data decomposition:\n";
}
std::vector gpu_out = {nullptr};
std::vector outbrick_lower;
std::vector outbrick_upper;
std::vector outbrick_stride = {1, length[1]};
{
const size_t outbrick_length1 = length[1] / (size_t)mpi_size
+ ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0);
const size_t outbrick_lower1
= mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size);
const size_t outbrick_upper1 = outbrick_lower1 + outbrick_length1;
outbrick_lower = {0, outbrick_lower1};
outbrick_upper = {length[0], outbrick_upper1};
const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex);
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "out-brick rank " << irank;
std::cout << "\n\tlower indices:";
for(const auto val : outbrick_lower)
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : outbrick_upper)
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : outbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
}
MPI_Barrier(mpi_comm);
}
rocfft_field outfield = nullptr;
rocfft_field_create(&outfield);
rocfft_brick outbrick = nullptr;
outbrick_lower = {0, outbrick_lower1};
outbrick_upper = {length[0], outbrick_lower1 + outbrick_length1};
rocfft_brick_create(&outbrick,
outbrick_lower.data(),
outbrick_upper.data(),
outbrick_stride.data(),
outbrick_lower.size(),
0);
rocfft_field_add_brick(outfield, outbrick);
rocfft_brick_destroy(outbrick);
outbrick = nullptr;
hiprc = hipMalloc(&gpu_out[0], memSize);
if(hiprc != hipSuccess)
throw std::runtime_error("outbrick hipMalloc failed");
rocfft_plan_description_add_outfield(description, outfield);
fftrc = rocfft_field_destroy(outfield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy outfield");
}
// In order still handle non-success return codes without killing all of the MPI processes, we
// put object creation in a try/catch block and destroy non-nullptr objects.
// Serialize output:
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "rank " << irank << "\n";
std::cout << "input ";
for(const auto& b : gpu_in)
std::cout << " " << b;
std::cout << "\n";
std::cout << "output ";
for(const auto& b : gpu_out)
std::cout << " " << b;
std::cout << "\n";
}
MPI_Barrier(mpi_comm);
}
fftrc = rocfft_status_sync(fftrc, mpi_comm);
hiprc = hip_status_sync(hiprc, mpi_comm);
if(mpi_rank == 0)
{
if(fftrc == rocfft_status_success && hiprc == hipSuccess)
{
std::cout << "so far so good, trying to make a plan....\n";
}
else
{
std::cout << "failure: will not make a plan....\n";
}
}
// Create a multi-process plan:
rocfft_plan gpu_plan = nullptr;
if(fftrc == rocfft_status_success && hiprc == hipSuccess)
{
fftrc = rocfft_plan_create(&gpu_plan,
place,
direction,
rocfft_precision_double,
length.size(), // Dimension
length.data(), // lengths
1, // Number of transforms
description); // Description
}
fftrc = rocfft_status_sync(fftrc, mpi_comm);
if(mpi_rank == 0)
{
if(fftrc == rocfft_status_success)
{
std::cout << "so far so good, we have a plan....\n";
}
else
{
std::cout << "failure: we do not have a plan....\n";
}
}
// Execute plan:
if(fftrc == rocfft_status_success)
{
fftrc = rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), nullptr);
}
fftrc = rocfft_status_sync(fftrc, mpi_comm);
if(mpi_rank == 0)
{
if(fftrc == rocfft_status_success)
{
std::cout << "The FFT was succesful....\n";
}
else
{
std::cout << "The FFT execution failed....\n";
}
}
// Output the data:
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "out brick rank " << irank << "\n";
const size_t outcount
= (outbrick_upper[0] - outbrick_lower[0]) * (outbrick_upper[1] - outbrick_lower[1]);
std::vector> host_out(outcount);
hiprc = hipMemcpy(host_out.data(),
gpu_out[0],
outcount * sizeof(std::complex),
hipMemcpyDeviceToHost);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMemcpy failed");
for(auto idx0 = outbrick_lower[0]; idx0 < outbrick_upper[0]; ++idx0)
{
for(auto idx1 = outbrick_lower[1]; idx1 < outbrick_upper[1]; ++idx1)
{
const auto pos = (idx0 - outbrick_lower[0]) * outbrick_stride[0]
+ (idx1 - outbrick_lower[1]) * outbrick_stride[1];
std::cout << host_out[pos] << " ";
}
std::cout << "\n";
}
}
MPI_Barrier(mpi_comm);
}
// Cleanup anything plan-generation structs (that aren't null pointers):
if(description != nullptr)
{
if(rocfft_plan_description_destroy(description) != rocfft_status_success)
{
std::cerr << "description descruction failed\n";
}
else
{
description = nullptr;
}
}
// Clean up the plan and rocfft:
try
{
if(gpu_plan != nullptr)
{
if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
gpu_plan = nullptr;
}
}
catch(const std::exception&)
{
std::cerr << "rank " << mpi_rank << " plan destroy failed\n";
}
for(auto& buf : gpu_in)
{
if(buf != nullptr)
{
hiprc = hipFree(buf);
if(hiprc != hipSuccess)
std::cerr << "hipFree failed\n";
buf = nullptr;
}
}
for(auto& buf : gpu_out)
{
if(buf != nullptr)
{
hiprc = hipFree(buf);
if(hiprc != hipSuccess)
std::cerr << "hipFree failed\n";
buf = nullptr;
}
}
fftrc = rocfft_cleanup();
MPI_Finalize();
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/multi_gpu/ 0000775 0000000 0000000 00000000000 15015373413 0020721 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/samples/multi_gpu/CMakeLists.txt 0000664 0000000 0000000 00000007664 15015373413 0023476 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. The cache entry is created without FORCE, so it must
# be set before project() and it never overrides a value supplied with -D.
# PROJECT_BINARY_DIR is only populated by project(), which has not run yet for
# this project, so the Windows default is anchored on CMAKE_BINARY_DIR (already
# defined at this point) instead of expanding to a bare "/package".
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# Default the build type to Release unless the user chose one with -D. This has
# to happen before project(). Generators that define CMAKE_CONFIGURATION_TYPES
# (e.g. the MSVC IDE) do not use CMAKE_BUILD_TYPE, so they are left alone.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-samples-multi_gpu LANGUAGES CXX )
# Let include() and find_package() see the CMake modules kept next to this file.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

# Locate rocFFT through its package config file, unless an enclosing build
# already provides a target named rocfft.
if(NOT TARGET rocfft)
  find_package(rocfft REQUIRED CONFIG PATHS)
endif()

# HIP is mandatory; the lookup is skipped when HIP_FOUND is already set.
if(NOT HIP_FOUND)
  find_package(HIP REQUIRED)
endif()

# hipRAND is only looked up when USE_HIPRAND is enabled and it was not found yet.
if(USE_HIPRAND AND NOT hiprand_FOUND)
  find_package(hiprand REQUIRED)
endif()
# Samples built from this directory; each entry <sample> is compiled from <sample>.cpp.
set( sample_list mgpu_complex)
# Configure one executable target per sample.
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the lone `$` below looks like a truncated generator expression
# (something of the form $<BUILD_INTERFACE:...>) — confirm the intended include path.
target_include_directories(
${sample}
PRIVATE
$
)
# Every sample links against the rocFFT library target.
target_link_libraries(
${sample}
PRIVATE roc::rocfft
)
# hipRAND is linked only when USE_HIPRAND is enabled.
if( USE_HIPRAND )
target_link_libraries(
${sample}
PRIVATE
hip::hiprand
)
endif()
# WARNING_FLAGS is not set in this file; presumably it is provided by an enclosing
# build and is empty otherwise. -Wno-cpp is passed unconditionally.
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Choose the executable output directory relative to PROJECT_BINARY_DIR, based on
# which *_BUILD_SCOPE variable is set. None of these values depend on ${sample},
# so the result is the same on every iteration.
if( ROCFFT_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../bin" )
else()
set( SAMPLES_ROCFFT_OUT_DIR "/bin" )
endif()
# Prefix the relative suffix chosen above with the project binary directory.
string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${SAMPLES_ROCFFT_OUT_DIR})
# Extra include paths and the __HIP_PLATFORM_NVCC__ definition when CUDA_FOUND is set.
# NOTE(review): the two `$` entries below also look like truncated generator
# expressions — confirm the intended include directories.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
# Additional host/device link libraries; these variables are not set in this file
# and are presumably supplied by the enclosing clients build.
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
endforeach( )
rocFFT-rocm-6.4.3/clients/samples/multi_gpu/mgpu_complex.cpp 0000664 0000000 0000000 00000032345 15015373413 0024133 0 ustar 00root root 0000000 0000000 // Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include
#include
#include
#include "../../../shared/CLI11.hpp"
#include "rocfft/rocfft.h"
#include
#include
#include
// Sample entry point: runs a double-precision, not-in-place, forward complex-to-complex
// FFT whose input and output are each split into one brick per GPU device.
// Steps: parse --length/--devices, describe the input and output fields brick by brick,
// allocate and fill the per-device input buffers, create and execute the plan, print the
// output bricks, then release rocFFT objects and device buffers.
// Failures are reported by throwing std::runtime_error; nothing in main catches them.
int main(int argc, char* argv[])
{
// NOTE(review): the banner says "3D", but length has two entries by default and the
// --length help text describes a 2-D transform — confirm the intended wording.
std::cout << "rocfft single-node multi-gpu complex-to-complex 3D FFT example\n";
// Length of transform, first dimension must be greater than number of GPU devices
std::vector length = {8, 8};
// Gpu device ids:
std::vector devices = {0, 1};
// Command-line options:
CLI::App app{"rocfft sample command line options"};
app.add_option("--length", length, "2-D FFT size (eg: --length 256 256)");
app.add_option(
"--devices", devices, "List of devices to use separated by spaces (eg: --devices 1 3)");
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
int deviceCount = devices.size();
std::cout << "Using " << deviceCount << " device(s)\n";
// NOTE(review): the status of hipGetDeviceCount is discarded and nDevices has no
// initializer, so nDevices is indeterminate if the call fails.
int nDevices;
(void)hipGetDeviceCount(&nDevices);
std::cout << "Number of available GPUs: " << nDevices << " \n";
// Reject the request when the largest requested device id is not below the device count.
if(nDevices <= static_cast(*std::max_element(devices.begin(), devices.end())))
throw std::runtime_error("device ID greater than number of available devices");
// Initialize the rocFFT library
auto fftrc = rocfft_status_success;
fftrc = rocfft_setup();
if(fftrc != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
// Placeness for the transform
const rocfft_result_placement place = rocfft_placement_notinplace;
// Direction of transform
const rocfft_transform_type direction = rocfft_transform_type_complex_forward;
// NOTE(review): the statuses returned by rocfft_plan_description_create and
// rocfft_plan_description_set_data_layout below are not checked.
rocfft_plan_description description = nullptr;
rocfft_plan_description_create(&description);
// Do not set stride information via the descriptor, they are to be defined during field
// creation below
rocfft_plan_description_set_data_layout(description,
rocfft_array_type_complex_interleaved,
rocfft_array_type_complex_interleaved,
nullptr,
nullptr,
0,
nullptr,
0,
0,
nullptr,
0);
auto hiprc = hipSuccess;
std::cout << "input data decomposition:\n";
// One device input buffer per requested device.
std::vector gpu_in(devices.size());
{
// Row-major stride for brick data layout in memory
// NOTE(review): each brick spans length[0] entries along index 0 but uses length[1] as
// the stride of index 1; with length[1] > length[0] the positions computed below look
// like they can exceed the length[0] * inbrick_length1 buffer — confirm for
// non-square sizes.
std::vector inbrick_stride = {1, length[1]};
rocfft_field infield = nullptr;
rocfft_field_create(&infield);
std::vector> inbrick_lower(gpu_in.size());
std::vector> inbrick_upper(gpu_in.size());
for(size_t idx = 0; idx < gpu_in.size(); ++idx)
{
// Split index 1 of the transform evenly over the bricks; the first
// (length[1] % brick count) bricks each receive one extra entry.
const size_t inbrick_length1
= length[1] / gpu_in.size() + (idx < length[1] % gpu_in.size() ? 1 : 0);
const size_t inbrick_lower1
= idx * (length[1] / gpu_in.size()) + std::min(idx, length[1] % gpu_in.size());
const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1;
// Each brick covers all of index 0 and the range [inbrick_lower1, inbrick_upper1)
// of index 1.
inbrick_lower[idx] = {0, inbrick_lower1};
inbrick_upper[idx] = {length[0], inbrick_upper1};
// Create the brick on device devices[idx], add it to the input field, and destroy
// the local handle again. The statuses of these calls are not checked.
rocfft_brick inbrick = nullptr;
rocfft_brick_create(&inbrick,
inbrick_lower[idx].data(),
inbrick_upper[idx].data(),
inbrick_stride.data(),
inbrick_lower[idx].size(),
devices[idx]);
rocfft_field_add_brick(infield, inbrick);
rocfft_brick_destroy(inbrick);
inbrick = nullptr;
// Size in bytes of this brick's device buffer.
const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex);
std::cout << "in-brick " << idx;
std::cout << "\n\tlower indices:";
for(const auto val : inbrick_lower[idx])
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : inbrick_upper[idx])
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : inbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
// Allocate the brick's buffer on its own device.
hiprc = hipSetDevice(devices[idx]);
if(hiprc != hipSuccess)
throw std::runtime_error("hipSetDevice failed");
hiprc = hipMalloc(&gpu_in[idx], memSize);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMalloc failed");
// Fill a host copy of the brick: the element at global index (idx0, idx1) holds the
// complex value (idx0, idx1). Each value is also printed.
std::vector> host_in(length[0] * inbrick_length1);
for(auto idx0 = inbrick_lower[idx][0]; idx0 < inbrick_upper[idx][0]; ++idx0)
{
for(auto idx1 = inbrick_lower[idx][1]; idx1 < inbrick_upper[idx][1]; ++idx1)
{
const auto pos = (idx0 - inbrick_lower[idx][0]) * inbrick_stride[0]
+ (idx1 - inbrick_lower[idx][1]) * inbrick_stride[1];
host_in[pos] = std::complex(idx0, idx1);
std::cout << host_in[pos] << " ";
}
std::cout << "\n";
}
// Upload the brick to its device buffer.
hiprc = hipMemcpy(gpu_in[idx], host_in.data(), memSize, hipMemcpyHostToDevice);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMemcpy failed");
}
// Attach the input field to the plan description, then destroy the local field handle.
rocfft_plan_description_add_infield(description, infield);
fftrc = rocfft_field_destroy(infield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy infield");
}
std::cout << "output data decomposition:\n";
// Output buffers and brick bounds are kept outside the scope below because they are
// needed again when the results are printed.
std::vector gpu_out(devices.size());
std::vector> outbrick_lower(gpu_out.size());
std::vector> outbrick_upper(gpu_out.size());
std::vector outbrick_stride = {1, length[1]};
{
rocfft_field outfield = nullptr;
rocfft_field_create(&outfield);
for(size_t idx = 0; idx < gpu_out.size(); ++idx)
{
// Same split of index 1 as for the input bricks. The remainder test uses
// gpu_in.size() where the surrounding code uses gpu_out.size(); both vectors are
// sized from devices.size(), so the value is the same.
const size_t outbrick_length1
= length[1] / gpu_out.size() + (idx < length[1] % gpu_in.size() ? 1 : 0);
const size_t outbrick_lower1
= idx * (length[1] / gpu_out.size()) + std::min(idx, length[1] % gpu_out.size());
rocfft_brick outbrick = nullptr;
outbrick_lower[idx] = {0, outbrick_lower1};
outbrick_upper[idx] = {length[0], outbrick_lower1 + outbrick_length1};
// Create the brick, add it to the output field, destroy the local handle
// (statuses are not checked).
rocfft_brick_create(&outbrick,
outbrick_lower[idx].data(),
outbrick_upper[idx].data(),
outbrick_stride.data(),
outbrick_lower[idx].size(),
devices[idx]);
rocfft_field_add_brick(outfield, outbrick);
rocfft_brick_destroy(outbrick);
outbrick = nullptr;
const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex);
std::cout << "out-brick " << idx;
std::cout << "\n\tlower indices:";
for(const auto val : outbrick_lower[idx])
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : outbrick_upper[idx])
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : outbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
// Allocate the output buffer on the brick's device. Unlike the input path, the
// hipSetDevice status is discarded here.
(void)hipSetDevice(devices[idx]);
if(hipMalloc(&gpu_out[idx], memSize) != hipSuccess)
throw std::runtime_error("hipMalloc failed");
}
// Attach the output field to the plan description, then destroy the local field handle.
rocfft_plan_description_add_outfield(description, outfield);
fftrc = rocfft_field_destroy(outfield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy outfield");
}
// Create a multi-gpu plan:
// The first listed device is made current before the plan is created.
(void)hipSetDevice(devices[0]);
rocfft_plan gpu_plan = nullptr;
fftrc = rocfft_plan_create(&gpu_plan,
place,
direction,
rocfft_precision_double,
length.size(), // Dimension
length.data(), // lengths
1, // Number of transforms
description); // Description
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to create plan");
// Get execution information and allocate work buffer
// planinfo stays nullptr (and is passed as such to rocfft_execute) when the plan
// reports a work buffer size of zero.
rocfft_execution_info planinfo = nullptr;
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(gpu_plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
// NOTE(review): work_buf is allocated with hipMalloc below, but no matching hipFree
// appears in the cleanup at the end of main.
void* work_buf = nullptr;
if(work_buf_size)
{
if(rocfft_execution_info_create(&planinfo) != rocfft_status_success)
throw std::runtime_error("failed to create execution info");
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed");
if(rocfft_execution_info_set_work_buffer(planinfo, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan:
// The arrays of per-brick input and output device pointers are passed directly.
fftrc = rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), planinfo);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to execute.");
// Output the data.
// Each output brick is copied to the host and printed, one line per idx0 value.
for(size_t idx = 0; idx < gpu_out.size(); ++idx)
{
std::cout << "out brick " << idx << "\n";
// Number of elements in this brick.
const auto nbrick = (outbrick_upper[idx][0] - outbrick_lower[idx][0])
* (outbrick_upper[idx][1] - outbrick_lower[idx][1]);
std::vector> host_out(nbrick);
hiprc = hipMemcpy(host_out.data(),
gpu_out[idx],
nbrick * sizeof(std::complex),
hipMemcpyDeviceToHost);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMemcpy failed");
for(auto idx0 = outbrick_lower[idx][0]; idx0 < outbrick_upper[idx][0]; ++idx0)
{
for(auto idx1 = outbrick_lower[idx][1]; idx1 < outbrick_upper[idx][1]; ++idx1)
{
const auto pos = (idx0 - outbrick_lower[idx][0]) * outbrick_stride[0]
+ (idx1 - outbrick_lower[idx][1]) * outbrick_stride[1];
std::cout << host_out[pos] << " ";
}
std::cout << "\n";
}
}
// Destroy the execution info (if one was created), the plan description and the plan
if(planinfo != nullptr)
{
if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
planinfo = nullptr;
}
if(rocfft_plan_description_destroy(description) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_description_destroy failed.");
description = nullptr;
if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
gpu_plan = nullptr;
// Shut down the rocFFT library
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
// Release the per-device input and output buffers; hipFree statuses are discarded.
for(size_t idx = 0; idx < gpu_in.size(); ++idx)
{
(void)hipFree(gpu_in[idx]);
}
for(size_t idx = 0; idx < gpu_out.size(); ++idx)
{
(void)hipFree(gpu_out[idx]);
}
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/rocfft/ 0000775 0000000 0000000 00000000000 15015373413 0020177 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/samples/rocfft/CMakeLists.txt 0000664 0000000 0000000 00000010036 15015373413 0022737 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. The cache entry is created without FORCE, so it must
# be set before project() and it never overrides a value supplied with -D.
# PROJECT_BINARY_DIR is only populated by project(), which has not run yet for
# this project, so the Windows default is anchored on CMAKE_BINARY_DIR (already
# defined at this point) instead of expanding to a bare "/package".
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# Default the build type to Release unless the user chose one with -D. This has
# to happen before project(). Generators that define CMAKE_CONFIGURATION_TYPES
# (e.g. the MSVC IDE) do not use CMAKE_BUILD_TYPE, so they are left alone.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-samples-rocfft LANGUAGES CXX )
# Let include() and find_package() see the CMake modules kept next to this file.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

# Locate rocFFT through its package config file, unless an enclosing build
# already provides a target named rocfft.
if(NOT TARGET rocfft)
  find_package(rocfft REQUIRED CONFIG PATHS)
endif()

# HIP is mandatory; the lookup is skipped when HIP_FOUND is already set.
if(NOT HIP_FOUND)
  find_package(HIP REQUIRED)
endif()

# hipRAND is only looked up when USE_HIPRAND is enabled and it was not found yet.
if(USE_HIPRAND AND NOT hiprand_FOUND)
  find_package(hiprand REQUIRED)
endif()
# Samples built from this directory; each entry <sample> is compiled from <sample>.cpp.
set( sample_list rocfft_example_complexcomplex rocfft_example_realcomplex rocfft_example_set_stream
rocfft_example_callback )
# Configure one executable target per sample.
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the lone `$` below looks like a truncated generator expression
# (something of the form $<BUILD_INTERFACE:...>) — confirm the intended include path.
target_include_directories(
${sample}
PRIVATE
$
)
# Every sample links against the rocFFT library target.
target_link_libraries(
${sample}
PRIVATE roc::rocfft
)
# hipRAND is linked only when USE_HIPRAND is enabled.
if( USE_HIPRAND )
target_link_libraries(
${sample}
PRIVATE
hip::hiprand
)
endif()
# WARNING_FLAGS is not set in this file; presumably it is provided by an enclosing
# build and is empty otherwise. -Wno-cpp is passed unconditionally.
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Choose the executable output directory relative to PROJECT_BINARY_DIR, based on
# which *_BUILD_SCOPE variable is set. None of these values depend on ${sample},
# so the result is the same on every iteration.
if( ROCFFT_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../bin" )
else()
set( SAMPLES_ROCFFT_OUT_DIR "/bin" )
endif()
# Prefix the relative suffix chosen above with the project binary directory.
string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${SAMPLES_ROCFFT_OUT_DIR})
# Extra include paths and the __HIP_PLATFORM_NVCC__ definition when CUDA_FOUND is set.
# NOTE(review): the two `$` entries below also look like truncated generator
# expressions — confirm the intended include directories.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
# Additional host/device link libraries; these variables are not set in this file
# and are presumably supplied by the enclosing clients build.
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
endforeach( )
rocFFT-rocm-6.4.3/clients/samples/rocfft/examplekernels.h 0000664 0000000 0000000 00000036177 15015373413 0023405 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef EXAMPLEKERNELS_H
#define EXAMPLEKERNELS_H
#include "../../../shared/data_gen_device.h"
#include
#include
#include
// Device kernel: fills a strided 1D real buffer so that logical element i holds i + 1.
// x       - destination buffer
// Nx      - number of logical elements
// xstride - distance, in elements, between consecutive logical elements
// Threads whose global index is not below Nx write nothing.
__global__ void initrdata1(double* x, const size_t Nx, const size_t xstride)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= Nx)
        return;
    x[i * xstride] = i + 1;
}
// Device kernel: fills a strided 2D real buffer so that logical element (i, j)
// holds i + j.
// x                - destination buffer
// Nx, Ny           - logical extents along the two indices
// xstride, ystride - distance, in elements, between neighbours along each index
// Threads whose global index lies outside the Nx-by-Ny range write nothing.
__global__ void initrdata2(
    double* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t j = blockIdx.y * blockDim.y + threadIdx.y;
    if(i >= Nx || j >= Ny)
        return;
    x[i * xstride + j * ystride] = i + j;
}
// Device kernel: fills a strided 3D real buffer. Logical element (i, j, k) is set
// to cos(cos(i + 2)) * sin(j * j + 1) / (k + 1).
// x                         - destination buffer
// Nx, Ny, Nz                - logical extents along the three indices
// xstride, ystride, zstride - distance, in elements, between neighbours along each index
// Threads whose global index lies outside the Nx-by-Ny-by-Nz range write nothing.
__global__ void initrdata3(double*      x,
                           const size_t Nx,
                           const size_t Ny,
                           const size_t Nz,
                           const size_t xstride,
                           const size_t ystride,
                           const size_t zstride)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t j = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t k = blockIdx.z * blockDim.z + threadIdx.z;
    if(i >= Nx || j >= Ny || k >= Nz)
        return;
    x[i * xstride + j * ystride + k * zstride] = cos(cos(i + 2)) * sin(j * j + 1) / (k + 1);
}
// Device kernel: fills a strided 1D complex buffer so that logical element i has
// both its real and imaginary part equal to 1 + i.
// x       - destination buffer
// Nx      - number of logical elements
// xstride - distance, in elements, between consecutive logical elements
// Threads whose global index is not below Nx write nothing.
__global__ void initcdata1(hipDoubleComplex* x, const size_t Nx, const size_t xstride)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= Nx)
        return;
    hipDoubleComplex& elem = x[i * xstride];
    elem.x                 = 1 + i;
    elem.y                 = 1 + i;
}
// Device kernel: fills a strided 2D complex buffer so that logical element (i, j)
// has real part i + 1 and imaginary part j + 1.
// x                - destination buffer
// Nx, Ny           - logical extents along the two indices
// xstride, ystride - distance, in elements, between neighbours along each index
// Threads whose global index lies outside the Nx-by-Ny range write nothing.
__global__ void initcdata2(hipDoubleComplex* x,
                           const size_t      Nx,
                           const size_t      Ny,
                           const size_t      xstride,
                           const size_t      ystride)
{
    // The indices keep the deduced type of the built-in index arithmetic.
    const auto i = blockIdx.x * blockDim.x + threadIdx.x;
    const auto j = blockIdx.y * blockDim.y + threadIdx.y;
    if(i >= Nx || j >= Ny)
        return;
    hipDoubleComplex& elem = x[i * xstride + j * ystride];
    elem.x                 = i + 1;
    elem.y                 = j + 1;
}
// Device kernel: fills a strided 3D complex buffer so that logical element (i, j, k)
// has real part i + 10.0 * k + 1 and imaginary part j + 10.
// x                         - destination buffer
// Nx, Ny, Nz                - logical extents along the three indices
// xstride, ystride, zstride - distance, in elements, between neighbours along each index
// Threads whose global index lies outside the Nx-by-Ny-by-Nz range write nothing.
__global__ void initcdata3(hipDoubleComplex* x,
                           const size_t      Nx,
                           const size_t      Ny,
                           const size_t      Nz,
                           const size_t      xstride,
                           const size_t      ystride,
                           const size_t      zstride)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t j = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t k = blockIdx.z * blockDim.z + threadIdx.z;
    if(i >= Nx || j >= Ny || k >= Nz)
        return;
    hipDoubleComplex& elem = x[i * xstride + j * ystride + k * zstride];
    elem.x                 = i + 10.0 * k + 1;
    elem.y                 = j + 10;
}
// Helper function for determining grid dimensions: integer division of nominator
// by denominator, rounded up. It is computed as
// (nominator + denominator - 1) / denominator, so both arguments are expected to be
// non-negative integers and denominator must not be zero.
// The result is converted to the type of the first argument.
template <typename Tint1, typename Tint2>
Tint1 ceildiv(const Tint1 nominator, const Tint2 denominator)
{
    return (nominator + denominator - 1) / denominator;
}
// The following functions call the above kernels to initialize the input data for the transform.
//
// initcomplex_cm: fill a complex device buffer by launching initcdata1/2/3, chosen by
// the number of entries in length_cm (1, 2 or 3).
//   length_cm - extents of the data (column-major, judging by the _cm suffix)
//   stride_cm - strides matching length_cm, in elements
//   gpu_in    - device buffer, cast to hipDoubleComplex*
// Any other dimension count prints "invalid dimension!" and calls exit(1).
// Throws std::runtime_error if hipGetLastError reports a failure after the launch.
void initcomplex_cm(const std::vector& length_cm,
const std::vector& stride_cm,
void* gpu_in)
{
// DATA_GEN_THREADS is not defined in this file; presumably it comes from
// ../../../shared/data_gen_device.h.
size_t blockSize = DATA_GEN_THREADS;
// The block is built from a single value, so only its x extent is set explicitly
// (y and z are presumably 1 — confirm against the dim3 definition).
const dim3 blockdim(blockSize);
switch(length_cm.size())
{
case 1:
{
// Enough blocks along x to cover length_cm[0] elements.
const dim3 griddim(ceildiv(length_cm[0], blockdim.x));
hipLaunchKernelGGL(initcdata1,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
length_cm[0],
stride_cm[0]);
break;
}
case 2:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y));
hipLaunchKernelGGL(initcdata2,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
length_cm[0],
length_cm[1],
stride_cm[0],
stride_cm[1]);
break;
}
case 3:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x),
ceildiv(length_cm[1], blockdim.y),
ceildiv(length_cm[2], blockdim.z));
hipLaunchKernelGGL(initcdata3,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
length_cm[0],
length_cm[1],
length_cm[2],
stride_cm[0],
stride_cm[1],
stride_cm[2]);
break;
}
default:
// NOTE(review): this exits the process, whereas impose_hermitian_symmetry_cm and
// init_hermitiancomplex_cm throw std::runtime_error for the same condition.
std::cout << "invalid dimension!\n";
exit(1);
}
// Surface launch errors as an exception carrying the HIP error name.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("init_complex_data kernel launch failure: "
+ std::string(hipGetErrorName(err)));
}
// Initialize the real input buffer where the data has lengths given in length_cm and
// strides given in stride_cm. The device buffer is assumed to have been allocated.
//   length_cm - extents of the data; 1, 2 or 3 entries select initrdata1/2/3
//   stride_cm - strides matching length_cm, in elements
//   gpu_in    - device buffer, cast to double*
// Any other dimension count prints "invalid dimension!" and calls exit(1).
// Throws std::runtime_error if hipGetLastError reports a failure after the launch.
void initreal_cm(const std::vector& length_cm,
const std::vector& stride_cm,
void* gpu_in)
{
// DATA_GEN_THREADS is not defined in this file; presumably it comes from
// ../../../shared/data_gen_device.h.
size_t blockSize = DATA_GEN_THREADS;
// The block is built from a single value, so only its x extent is set explicitly
// (y and z are presumably 1 — confirm against the dim3 definition).
const dim3 blockdim(blockSize);
switch(length_cm.size())
{
case 1:
{
// Enough blocks along x to cover length_cm[0] elements.
const dim3 griddim(ceildiv(length_cm[0], blockdim.x));
hipLaunchKernelGGL(
initrdata1, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], stride_cm[0]);
break;
}
case 2:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y));
hipLaunchKernelGGL(initrdata2,
griddim,
blockdim,
0,
0,
(double*)gpu_in,
length_cm[0],
length_cm[1],
stride_cm[0],
stride_cm[1]);
break;
}
case 3:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x),
ceildiv(length_cm[1], blockdim.y),
ceildiv(length_cm[2], blockdim.z));
hipLaunchKernelGGL(initrdata3,
griddim,
blockdim,
0,
0,
(double*)gpu_in,
length_cm[0],
length_cm[1],
length_cm[2],
stride_cm[0],
stride_cm[1],
stride_cm[2]);
break;
}
default:
// NOTE(review): this exits the process, whereas impose_hermitian_symmetry_cm and
// init_hermitiancomplex_cm throw std::runtime_error for the same condition.
std::cout << "invalid dimension!\n";
exit(1);
}
// Surface launch errors as an exception carrying the HIP error name.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("init_real_data kernel launch failure: "
+ std::string(hipGetErrorName(err)));
}
// Imposes Hermitian symmetry for the input device buffer.
// Note: input parameters are in column-major ordering.
//   length  - lengths of the transform
//   ilength - lengths of the complex input data
//   stride  - strides of the input data, in elements
//   gpu_in  - device buffer, cast to hipDoubleComplex*
// The impose_hermitian_symmetry_interleaved_*D_kernel kernels and DivRoundingUp are not
// defined in this file; presumably they come from ../../../shared/data_gen_device.h.
// Throws std::runtime_error for dimension counts other than 1-3 and when
// hipGetLastError reports a failure after the launch.
void impose_hermitian_symmetry_cm(const std::vector& length,
const std::vector& ilength,
const std::vector& stride,
void* gpu_in)
{
// This helper always handles a single batch, so batch and dist are fixed at 1.
size_t batch = 1;
size_t dist = 1;
size_t blockSize = DATA_GEN_THREADS;
auto inputDim = length.size();
// Launch impose_hermitian_symmetry kernels.
// NOTE: input parameters must be in row-major
// ordering for these kernels.
// That is why lengths and strides are passed in reverse index order below.
switch(inputDim)
{
case 1:
{
const auto gridDim = dim3(DivRoundingUp(batch, blockSize));
const auto blockDim = dim3(blockSize);
hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel,
gridDim,
blockDim,
0,
0,
(hipDoubleComplex*)gpu_in,
length[0],
stride[0],
dist,
batch,
length[0] % 2 == 0);
break;
}
case 2:
{
// NOTE(review): the grid is sized from length[1] while the kernel argument below
// uses ilength[1] — confirm that the two are meant to differ.
const auto gridDim = dim3(DivRoundingUp(batch, blockSize),
DivRoundingUp((length[1] + 1) / 2 - 1, blockSize));
const auto blockDim = dim3(blockSize, blockSize);
hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel,
gridDim,
blockDim,
0,
0,
(hipDoubleComplex*)gpu_in,
length[1],
length[0],
stride[1],
stride[0],
dist,
batch,
(ilength[1] + 1) / 2 - 1,
length[1] % 2 == 0,
length[0] % 2 == 0);
break;
}
case 3:
{
// As in the 2D case, the grid is sized from length while the kernel arguments
// below use ilength.
const auto gridDim = dim3(DivRoundingUp(batch, blockSize),
DivRoundingUp((length[2] + 1) / 2 - 1, blockSize),
DivRoundingUp(length[1] - 1, blockSize));
const auto blockDim = dim3(blockSize, blockSize, blockSize);
hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel,
gridDim,
blockDim,
0,
0,
(hipDoubleComplex*)gpu_in,
length[2],
length[1],
length[0],
stride[2],
stride[1],
stride[0],
dist,
batch,
(ilength[2] + 1) / 2 - 1,
ilength[1] - 1,
(ilength[1] + 1) / 2 - 1,
length[2] % 2 == 0,
length[1] % 2 == 0,
length[0] % 2 == 0);
break;
}
default:
throw std::runtime_error("Invalid dimension");
}
// Surface launch errors as an exception carrying the HIP error name.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("impose_hermitian_symmetry_interleaved kernel launch failure: "
+ std::string(hipGetErrorName(err)));
}
// Initialize the Hermitian complex input buffer where the data has lengths given in
// ilength, the transform has lengths given in length, and the strides are given in
// stride. The device buffer is assumed to have been allocated.
// The buffer is first filled by initcdata1/2/3 (selected by length.size()) over the
// ilength extents, then impose_hermitian_symmetry_cm is applied to it.
// Throws std::runtime_error for dimension counts other than 1-3 and when
// hipGetLastError reports a failure after the fill launch.
void init_hermitiancomplex_cm(const std::vector& length,
const std::vector& ilength,
const std::vector& stride,
void* gpu_in)
{
// NOTE(review): the block size is hard-coded to 256 here, while the sibling init
// functions in this file use DATA_GEN_THREADS — confirm whether that is intended.
size_t blockSize = 256;
const dim3 blockdim(blockSize);
switch(length.size())
{
case 1:
{
// blockSize and blockdim.x hold the same value; this case uses the former.
const dim3 griddim(ceildiv(ilength[0], blockSize));
hipLaunchKernelGGL(
initcdata1, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], stride[0]);
break;
}
case 2:
{
const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y));
hipLaunchKernelGGL(initcdata2,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
ilength[0],
ilength[1],
stride[0],
stride[1]);
break;
}
case 3:
{
const dim3 griddim(ceildiv(ilength[0], blockdim.x),
ceildiv(ilength[1], blockdim.y),
ceildiv(ilength[2], blockdim.z));
hipLaunchKernelGGL(initcdata3,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
ilength[0],
ilength[1],
ilength[2],
stride[0],
stride[1],
stride[2]);
break;
}
default:
throw std::runtime_error("Invalid dimension");
}
// Surface launch errors as an exception carrying the HIP error name.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("init_complex_data kernel launch failure: "
+ std::string(hipGetErrorName(err)));
// Make the freshly filled data Hermitian-symmetric.
impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in);
}
#endif /* EXAMPLEKERNELS_H */
rocFFT-rocm-6.4.3/clients/samples/rocfft/exampleutils.h 0000664 0000000 0000000 00000013644 15015373413 0023074 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef EXAMPLEUTILS_H
#define EXAMPLEUTILS_H
// Stream a hipDoubleComplex as "(x,y)", with x and y being its two components.
// Returns the stream so that insertions can be chained.
std::ostream& operator<<(std::ostream& stream, hipDoubleComplex c)
{
    return stream << "(" << c.x << "," << c.y << ")";
}
// Increment the index (column-major) for looping over arbitrary dimensional loops with
// dimensions length.
//   index  - current multi-index, advanced in place; the first entry varies fastest
//   length - extent of every dimension
// An entry that reaches its extent wraps to zero and carries into the next one.
// Returns true while the index has not wrapped back to all zeros, so the function
// can drive a do { ... } while(increment_cm(index, length)); loop.
template <class T1, class T2>
bool increment_cm(std::vector<T1>& index, const std::vector<T2>& length)
{
    for(unsigned int idim = 0; idim < length.size(); ++idim)
    {
        if(index[idim] < length[idim])
        {
            if(++index[idim] == length[idim])
            {
                // This dimension is exhausted: reset it and carry into the next one.
                index[idim] = 0;
                continue;
            }
            break;
        }
    }
    // End the loop when we get back to the start. The predicate takes the element in
    // its own type so that wide index values are not narrowed before the comparison.
    return !std::all_of(index.begin(), index.end(), [](const T1& i) { return i == 0; });
}
// Output a formatted general-dimensional array with given length and stride in batches
// separated by dist, in column-major order.
//   data   - host buffer holding all batches
//   length - extent of every dimension
//   stride - stride of every dimension, in elements
//   nbatch - number of batches to print
//   dist   - distance, in elements, between the starts of consecutive batches
// Elements are written to std::cout separated by spaces. A newline is emitted for each
// leading dimension that is at its last entry, and an extra line ends every batch.
template
void printbuffer_cm(const std::vector& data,
const std::vector& length,
const std::vector& stride,
const size_t nbatch,
const size_t dist)
{
for(size_t b = 0; b < nbatch; b++)
{
// Multi-index into the current batch, starting at the origin.
std::vector index(length.size());
std::fill(index.begin(), index.end(), 0);
do
{
// Flat offset of the element: batch offset plus the dot product of index and stride.
const auto i = std::inner_product(index.begin(), index.end(), stride.begin(), b * dist);
assert(i >= 0);
assert(i < data.size());
std::cout << data[i] << " ";
// Emit one newline per consecutive leading dimension that is at its last entry.
for(size_t idx = 0; idx < index.size(); ++idx)
{
if(index[idx] == (length[idx] - 1))
{
std::cout << "\n";
}
else
{
break;
}
}
} while(increment_cm(index, length));
std::cout << std::endl;
}
}
// Check that a multi-dimensional array of complex values with dimensions length_cm
// and strides stride_cm, with nbatch copies separated by dist, is Hermitian-symmetric.
// Column-major version.
//   data      - host buffer; elements are read through their .x and .y members
//   length_cm - extent of every dimension
//   stride_cm - stride of every dimension, in elements
//   nbatch    - number of batches to check
//   dist      - distance, in elements, between the starts of consecutive batches
//   verbose   - when true, print every pair of entries that is not conjugate
// Returns true when every checked element equals the complex conjugate of the element
// at the negated index (taken modulo the lengths). The comparison is exact (!=).
template
bool check_symmetry_cm(const std::vector& data,
const std::vector& length_cm,
const std::vector& stride_cm,
const size_t nbatch,
const size_t dist,
const bool verbose = true)
{
bool issymmetric = true;
for(size_t b = 0; b < nbatch; b++)
{
// Multi-index into the current batch, starting at the origin.
std::vector index(length_cm.size());
std::fill(index.begin(), index.end(), 0);
do
{
bool skip = false;
// negindex is the mirrored index: (length - index) modulo length, per dimension.
std::vector negindex(index.size());
for(size_t idx = 0; idx < index.size(); ++idx)
{
// Indices whose first entry is beyond length_cm[0] / 2 are skipped. This test does
// not depend on idx, so it gives the same result on every pass of this loop.
if(index[0] > length_cm[0] / 2)
{
skip = true;
break;
}
negindex[idx] = (length_cm[idx] - index[idx]) % length_cm[idx];
}
// The mirrored element must also lie within the first half of dimension 0.
if(negindex[0] > length_cm[0] / 2)
{
skip = true;
}
if(!skip)
{
// Flat offsets of the element and of its mirror image within batch b.
const auto i
= std::inner_product(index.begin(), index.end(), stride_cm.begin(), b * dist);
const auto j = std::inner_product(
negindex.begin(), negindex.end(), stride_cm.begin(), b * dist);
// Conjugate pair: equal real parts and opposite imaginary parts.
if((data[i].x != data[j].x) or (data[i].y != -data[j].y))
{
if(verbose)
{
// Report both indices, their flat offsets and the two values.
std::cout << "(";
std::string separator;
for(auto val : index)
{
std::cout << separator << val;
separator = ",";
}
std::cout << ")->";
std::cout << i << "\t";
std::cout << "(";
separator = "";
for(auto val : negindex)
{
std::cout << separator << val;
separator = ",";
}
std::cout << ")->";
std::cout << j << ":\t";
std::cout << data[i] << " " << data[j];
std::cout << "\tnot conjugate!" << std::endl;
}
issymmetric = false;
}
}
} while(increment_cm(index, length_cm));
}
return issymmetric;
}
#endif /* EXAMPLEUTILS_H */
rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_callback.cpp 0000664 0000000 0000000 00000015670 15015373413 0025526 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include "rocfft/rocfft.h"
#include
#include
#include
#include
#include
#include
#include
// example of using load/store callbacks with rocfft
// User data passed to load_callback through its cbdata pointer. main() fills
// one instance on the host and copies it to device memory before registering
// the callback.
struct load_cbdata
{
// Filter values; the callback reads filter[offset]. main() points this at a
// buffer obtained from hipMalloc.
double2* filter;
// Value the callback uses for both the real and imaginary parts of the
// complex factor it multiplies each element by.
double scale;
};
// Load callback: produces the value rocFFT should use for one input element.
//
// input     : base pointer of the input buffer.
// offset    : element offset of the value being loaded.
// cbdata    : user data registered with the callback; main() registers a
//             device copy of a load_cbdata.
// sharedMem : not used by this callback.
//
// Returns input[offset] * filter[offset] * (scale + i*scale).
__device__ double2 load_callback(double2* input, size_t offset, void* cbdata, void* sharedMem)
{
    // Recover the typed user data from the opaque pointer.
    const auto* data = static_cast<const load_cbdata*>(cbdata);
    // multiply each element by filter element and scale
    return hipCmul(hipCmul(input[offset], data->filter[offset]),
                   make_hipDoubleComplex(data->scale, data->scale));
}
// Device-side variable holding the address of load_callback. main() reads it
// back with hipMemcpyFromSymbol to obtain a pointer value it can pass to
// rocfft_execution_info_set_load_callback.
__device__ auto load_callback_dev = load_callback;
int main()
{
const size_t N = 8;
std::vector cx(N), filter(N);
// initialize data and filter
for(size_t i = 0; i < N; i++)
{
cx[i].x = i;
cx[i].y = i;
filter[i].x = rand() / static_cast(RAND_MAX);
filter[i].y = 0;
}
// rocfft gpu compute
// ==================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(double2);
// Create HIP device object.
double2 *x, *filter_dev;
// create buffers
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(hipMalloc(&filter_dev, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
hipError_t hip_status = hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
hip_status = hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = nullptr;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_double,
1,
&length,
1,
nullptr)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Check if the plan requires a work buffer
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
void* work_buf = nullptr;
rocfft_execution_info info = nullptr;
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(work_buf_size)
{
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Prepare callback
load_cbdata cbdata_host;
cbdata_host.filter = filter_dev;
cbdata_host.scale = 1.0 / static_cast(N);
void* cbdata_dev;
if(hipMalloc(&cbdata_dev, sizeof(load_cbdata)) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
hip_status = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Get a properly-typed host pointer to the device function, as
// rocfft_execution_info_set_load_callback expects void*.
void* cbptr_host = nullptr;
hip_status = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*));
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpyFromSymbol failed.");
// set callback
if(rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_load_callback failed.");
// Execute plan
if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
// Clean up work buffer
if(work_buf_size)
{
if(hipFree(work_buf) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
}
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
// Copy result back to host
std::vector y(N);
hip_status = hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(cbdata_dev) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(hipFree(filter_dev) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_complexcomplex.cpp 0000664 0000000 0000000 00000024541 15015373413 0027026 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include
#include
#include
#include
#include
#include
#include "../../../shared/CLI11.hpp"
#include "examplekernels.h"
#include "exampleutils.h"
#include
int main(int argc, char* argv[])
{
std::cout << "rocfft double-precision complex-to-complex transform\n" << std::endl;
// Length of transform:
std::vector length = {8};
// Gpu device id:
size_t deviceId = 0;
// Command-line options:
CLI::App app{"rocfft sample command line options"};
app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
CLI::Option* opt_outofplace
= app.add_flag("-o, --outofplace", "Perform an out-of-place transform");
CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform");
app.add_option(
"--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)");
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
// Placeness for the transform
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
const rocfft_result_placement place
= *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace;
const bool inplace = place == rocfft_placement_inplace;
// Direction of transform
const rocfft_transform_type direction = *opt_inverse ? rocfft_transform_type_complex_forward
: rocfft_transform_type_complex_inverse;
// Set up the strides and buffer size for the input:
std::vector istride = {1};
for(unsigned int i = 1; i < length.size(); ++i)
{
istride.push_back(length[i - 1] * istride[i - 1]);
}
const size_t isize = length[length.size() - 1] * istride[istride.size() - 1];
// Set up the strides and buffer size for the output:
std::vector ostride = {1};
for(unsigned int i = 1; i < length.size(); ++i)
{
ostride.push_back(length[i - 1] * ostride[i - 1]);
}
const size_t osize = length[length.size() - 1] * ostride[ostride.size() - 1];
// Print information about the transform:
std::cout << "direction: ";
if(direction == rocfft_transform_type_complex_forward)
std::cout << "forward\n";
else
std::cout << "inverse\n";
std::cout << "length:";
for(const auto i : length)
std::cout << " " << i;
std::cout << "\n";
if(inplace)
std::cout << "in-place transform\n";
else
std::cout << "out-of-place transform\n";
std::cout << "deviceID: " << deviceId << "\n";
std::cout << "input strides:";
for(auto i : istride)
std::cout << " " << i;
std::cout << "\n";
std::cout << "output strides:";
for(auto i : ostride)
std::cout << " " << i;
std::cout << "\n";
std::cout << "input size: " << isize << "\n";
std::cout << "output size: " << isize << "\n";
std::cout << std::endl;
// Set the device:
if(hipSetDevice(deviceId) != hipSuccess)
throw std::runtime_error("hipSetDevice failed.");
// Create HIP device object and allocate data
hipDoubleComplex* gpu_in = nullptr;
if(hipMalloc(&gpu_in, isize * sizeof(hipDoubleComplex)) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Inititalize the data on the device
initcomplex_cm(length, istride, gpu_in);
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
hipError_t hip_status = hipGetLastError();
if(hip_status != hipSuccess)
throw std::runtime_error("device error");
std::cout << "input:\n";
std::vector idata(isize);
hip_status
= hipMemcpy(idata.data(), gpu_in, isize * sizeof(hipDoubleComplex), hipMemcpyDefault);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(idata, length, istride, 1, isize);
// Create the a descrition struct to set data layout:
rocfft_plan_description gpu_description = nullptr;
// rocfft_status can be used to capture API status info
rocfft_status rc = rocfft_plan_description_create(&gpu_description);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create plan description");
rc = rocfft_plan_description_set_data_layout(gpu_description,
rocfft_array_type_complex_interleaved,
rocfft_array_type_complex_interleaved,
nullptr,
nullptr,
istride.size(), // input stride length
istride.data(), // input stride data
0, // input batch distance
ostride.size(), // output stride length
ostride.data(), // output stride data
0); // ouptut batch distance
if(rc != rocfft_status_success)
throw std::runtime_error("failed to set data layout");
// We can also pass "nullptr" instead of a description; rocFFT will use reasonable
// default parameters. If the data isn't contiguous, we need to set strides, etc,
// using the description.
// Create the plan
rocfft_plan gpu_plan = nullptr;
rc = rocfft_plan_create(&gpu_plan,
place,
direction,
rocfft_precision_double,
length.size(), // Dimension
length.data(), // lengths
1, // Number of transforms
gpu_description); // Description
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create plan");
// Get the execution info for the fft plan (in particular, work memory requirements):
rocfft_execution_info planinfo = nullptr;
rc = rocfft_execution_info_create(&planinfo);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create execution info");
size_t workbuffersize = 0;
rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to get work buffer size");
// If the transform requires work memory, allocate a work buffer:
void* wbuffer = nullptr;
if(workbuffersize > 0)
{
hip_status = hipMalloc(&wbuffer, workbuffersize);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to set work buffer.");
}
// If the transform is out-of-place, allocate the output buffer as well:
double2* gpu_out = inplace ? gpu_in : nullptr;
if(!inplace)
{
hip_status = hipMalloc(&gpu_out, osize * sizeof(hipDoubleComplex));
if(hip_status != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
}
// Execute the GPU transform:
rc = rocfft_execute(gpu_plan, // plan
(void**)&gpu_in, // in_buffer
(void**)&gpu_out, // out_buffer
planinfo); // execution info
if(rc != rocfft_status_success)
throw std::runtime_error("failed to execute.");
// Get the output from the device and print to cout:
std::cout << "output:\n";
std::vector odata(osize);
hip_status
= hipMemcpy(odata.data(), gpu_out, osize * sizeof(hipDoubleComplex), hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(odata, length, istride, 1, isize);
// Clean up: free GPU memory:
if(hipFree(gpu_in) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(!inplace)
{
if(hipFree(gpu_out) != hipSuccess)
throw std::runtime_error("hipFree failed.");
}
if(wbuffer != nullptr)
{
if(hipFree(wbuffer) != hipSuccess)
throw std::runtime_error("hipFree failed.");
}
// Clean up: destroy plans:
if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
planinfo = nullptr;
if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_description_destroy failed.");
gpu_description = nullptr;
if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
gpu_plan = nullptr;
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_realcomplex.cpp 0000664 0000000 0000000 00000027731 15015373413 0026306 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include
#include
#include
#include
#include
#include
#include "../../../shared/CLI11.hpp"
#include "examplekernels.h"
#include "exampleutils.h"
#include
int main(int argc, char* argv[])
{
std::cout << "rocfft double-precision real/complex transform\n" << std::endl;
// Length of transform:
std::vector length = {8};
// Gpu device id:
size_t deviceId = 0;
// Command-line options:
CLI::App app{"rocfft sample command line options"};
app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
CLI::Option* opt_outofplace
= app.add_flag("-o, --outofplace", "Perform an out-of-place transform");
CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform");
app.add_option(
"--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)");
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
// Placeness for the transform
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
const rocfft_result_placement place
= *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace;
const bool inplace = place == rocfft_placement_inplace;
// Direction of transform
const rocfft_transform_type direction
= *opt_inverse ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward;
const bool forward = direction == rocfft_transform_type_real_forward;
// Set up the strides and buffer size for the real values:
std::vector rstride = {1};
for(unsigned int i = 1; i < length.size(); ++i)
{
// In-place transforms need space for two extra real values in the contiguous
// direction.
auto val = (length[i - 1] + ((inplace && i == 1) ? 2 : 0)) * rstride[i - 1];
rstride.push_back(val);
}
// NB: not tight, but hey
const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1];
std::vector rdata(real_size); // host storage
// The complex data length is half + 1 of the real data length in the contiguous
// dimensions. Since rocFFT is column-major, this is the first index.
std::vector clength = length;
clength[0] = clength[0] / 2 + 1;
std::vector cstride = {1};
for(unsigned int i = 1; i < clength.size(); ++i)
{
cstride.push_back(clength[i - 1] * cstride[i - 1]);
}
const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1];
std::vector cdata(complex_size); // host storage
// Based on the direction, we set the input and output parameters appropriately.
const size_t isize = forward ? real_size : complex_size;
const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex));
const std::vector ilength = forward ? length : clength;
const std::vector istride = forward ? rstride : cstride;
const size_t osize = forward ? complex_size : real_size;
const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double));
const std::vector olength = forward ? clength : length;
const std::vector ostride = forward ? cstride : rstride;
// Print information about the transform:
std::cout << "direction: ";
if(forward)
std::cout << "forward\n";
else
std::cout << "inverse\n";
std::cout << "length:";
for(const auto i : length)
std::cout << " " << i;
std::cout << "\n";
if(inplace)
std::cout << "in-place transform\n";
else
std::cout << "out-of-place transform\n";
std::cout << "deviceID: " << deviceId << "\n";
std::cout << "input length:";
for(auto i : ilength)
std::cout << " " << i;
std::cout << "\n";
std::cout << "input buffer stride:";
for(auto i : istride)
std::cout << " " << i;
std::cout << "\n";
std::cout << "input buffer size: " << ibytes << "\n";
std::cout << "output length:";
for(auto i : olength)
std::cout << " " << i;
std::cout << "\n";
std::cout << "output buffer stride:";
for(auto i : ostride)
std::cout << " " << i;
std::cout << "\n";
std::cout << "output buffer size: " << obytes << "\n";
std::cout << std::endl;
// Set the device:
if(hipSetDevice(deviceId) != hipSuccess)
throw std::runtime_error("hipSetDevice failed.");
// Create HIP device object and initialize data
// Kernels are provided in examplekernels.h
void* gpu_in = nullptr;
hipError_t hip_status = hipMalloc(&gpu_in, inplace ? std::max(ibytes, obytes) : ibytes);
if(hip_status != hipSuccess)
throw std::runtime_error("device error");
if(forward)
{
initreal_cm(length, istride, gpu_in);
}
else
{
init_hermitiancomplex_cm(length, ilength, istride, gpu_in);
}
// Print the input:
std::cout << "input:\n";
if(forward)
{
hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(rdata, ilength, istride, 1, isize);
}
else
{
hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(cdata, ilength, istride, 1, isize);
// Check that the buffer is Hermitian symmetric:
check_symmetry_cm(cdata, length, istride, 1, isize);
}
// rocfft_status can be used to capture API status info
rocfft_status rc = rocfft_status_success;
// Create the a descrition struct to set data layout:
rocfft_plan_description gpu_description = nullptr;
rc = rocfft_plan_description_create(&gpu_description);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create plan description");
rc = rocfft_plan_description_set_data_layout(
gpu_description,
// input data format:
forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
// output data format:
forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
nullptr,
nullptr,
istride.size(), // input stride length
istride.data(), // input stride data
0, // input batch distance
ostride.size(), // output stride length
ostride.data(), // output stride data
0); // ouptut batch distance
if(rc != rocfft_status_success)
throw std::runtime_error("failed to set data layout");
// We can also pass "nullptr" instead of a description; rocFFT will use reasonable
// default parameters. If the data isn't contiguous, we need to set strides, etc,
// using the description.
// Create the FFT plan:
rocfft_plan gpu_plan = nullptr;
rc = rocfft_plan_create(&gpu_plan,
place,
direction,
rocfft_precision_double,
length.size(), // Dimension
length.data(), // lengths
1, // Number of transforms
gpu_description); // Description
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create plan");
// Get the execution info for the fft plan (in particular, work memory requirements):
rocfft_execution_info planinfo = nullptr;
rc = rocfft_execution_info_create(&planinfo);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create execution info");
size_t workbuffersize = 0;
rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to get work buffer size");
// If the transform requires work memory, allocate a work buffer:
void* wbuffer = nullptr;
if(workbuffersize > 0)
{
hip_status = hipMalloc(&wbuffer, workbuffersize);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMalloc failed");
rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to set work buffer");
}
// If the transform is out-of-place, allocate the output buffer as well:
void* gpu_out = inplace ? gpu_in : nullptr;
if(!inplace)
{
hip_status = hipMalloc(&gpu_out, obytes);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMalloc failed");
}
// Execute the GPU transform:
rc = rocfft_execute(gpu_plan, // plan
(void**)&gpu_in, // in_buffer
(void**)&gpu_out, // out_buffer
planinfo); // execution info
if(rc != rocfft_status_success)
throw std::runtime_error("failed to execute");
// Get the output from the device and print to cout:
std::cout << "output:\n";
if(forward)
{
hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(cdata, olength, ostride, 1, osize);
}
else
{
hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(rdata, olength, ostride, 1, osize);
}
// Clean up: free GPU memory:
if(hipFree(gpu_in) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(!inplace)
{
if(hipFree(gpu_out) != hipSuccess)
throw std::runtime_error("hipFree failed.");
}
if(wbuffer != nullptr)
{
if(hipFree(wbuffer) != hipSuccess)
throw std::runtime_error("hipFree failed.");
}
// Clean up: destroy plans:
if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
planinfo = nullptr;
if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_description_destroy failed.");
gpu_description = nullptr;
if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
gpu_plan = nullptr;
rocfft_cleanup();
return 0;
}
rocFFT-rocm-6.4.3/clients/samples/rocfft/rocfft_example_set_stream.cpp 0000664 0000000 0000000 00000012647 15015373413 0026141 0 ustar 00root root 0000000 0000000 // Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "rocfft/rocfft.h"
#include
#include
#include
#include
#include
struct fft_fixture_t
{
std::vector cpu_buf;
double2* gpu_buf = nullptr;
hipStream_t stream = nullptr;
rocfft_execution_info info = nullptr;
rocfft_plan plan = nullptr;
};
// Sample: two in-place, length-8, double-precision forward complex transforms,
// each submitted to its own HIP stream.
//
// The command-line arguments are not used. Returns 0 on success; any failing
// HIP or rocFFT call is reported by throwing std::runtime_error.
int main(int argc, char* argv[])
{
    std::cout << "rocfft example of 2 inplace transforms with 2 streams.\n" << std::endl;

    size_t length      = 8;
    size_t total_bytes = length * sizeof(double2);

    hipError_t    hip_status;
    rocfft_status fft_status;

    fft_fixture_t ffts[2];

    /// preparation
    if(rocfft_setup() != rocfft_status_success)
        throw std::runtime_error("rocfft_setup failed.");
    for(auto& it : ffts)
    {
        // create cpu buffer
        it.cpu_buf.resize(length);

        // init cpu buffer...

        // create gpu buffer
        if(hipMalloc(&(it.gpu_buf), total_bytes) != hipSuccess)
            throw std::runtime_error("hipMalloc failed.");

        // copy host to device
        if(hipMemcpy(it.gpu_buf, it.cpu_buf.data(), total_bytes, hipMemcpyHostToDevice)
           != hipSuccess)
            throw std::runtime_error("hipMemcpy failed.");

        // create stream
        if(hipStreamCreate(&(it.stream)) != hipSuccess)
            throw std::runtime_error("hipStreamCreate failed.");

        // create execution info
        fft_status = rocfft_execution_info_create(&(it.info));
        if(fft_status != rocfft_status_success)
            throw std::runtime_error("rocfft_execution_info_create failed.");

        // set stream
        // NOTE: The stream must be of type hipStream_t.
        // It is an error to pass the address of a hipStream_t object.
        fft_status = rocfft_execution_info_set_stream(it.info, it.stream);
        if(fft_status != rocfft_status_success)
            throw std::runtime_error("rocfft_execution_info_set_stream failed.");

        // create plan
        fft_status = rocfft_plan_create(&it.plan,
                                        rocfft_placement_inplace,
                                        rocfft_transform_type_complex_forward,
                                        rocfft_precision_double,
                                        1,
                                        &length,
                                        1,
                                        nullptr);
        if(fft_status != rocfft_status_success)
            throw std::runtime_error("rocfft_plan_create failed.");

        size_t work_buf_size = 0;
        fft_status           = rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size);
        if(fft_status != rocfft_status_success)
            throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");

        assert(work_buf_size == 0); // simple 1D inplace fft doesn't need extra working buffer
    }

    /// execution
    for(auto& it : ffts)
    {
        // Pass the execution info so the transform is submitted to the stream
        // configured above; the loop below synchronises on that same stream.
        fft_status = rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), it.info);
        if(fft_status != rocfft_status_success)
            throw std::runtime_error("rocfft_execute failed.");
    }

    /// wait and copy back
    for(auto& it : ffts)
    {
        if(hipStreamSynchronize(it.stream) != hipSuccess)
            throw std::runtime_error("hipStreamSynchronize failed.");
        hip_status = hipMemcpy(it.cpu_buf.data(), it.gpu_buf, total_bytes, hipMemcpyDeviceToHost);
        if(hip_status != hipSuccess)
            throw std::runtime_error("hipMemcpy failed.");
    }

    /// clean up
    for(auto& it : ffts)
    {
        fft_status = rocfft_plan_destroy(it.plan);
        if(fft_status != rocfft_status_success)
            throw std::runtime_error("rocfft_plan_destroy failed.");
        fft_status = rocfft_execution_info_destroy(it.info);
        if(fft_status != rocfft_status_success)
            throw std::runtime_error("rocfft_execution_info_destroy failed.");
        if(hipStreamDestroy(it.stream) != hipSuccess)
            throw std::runtime_error("hipStreamDestroy failed.");
        if(hipFree(it.gpu_buf) != hipSuccess)
            throw std::runtime_error("hipFree failed.");
    }

    if(rocfft_cleanup() != rocfft_status_success)
        throw std::runtime_error("rocfft_cleanup failed.");

    return 0;
}
rocFFT-rocm-6.4.3/clients/tests/ 0000775 0000000 0000000 00000000000 15015373413 0016412 5 ustar 00root root 0000000 0000000 rocFFT-rocm-6.4.3/clients/tests/CMakeLists.txt 0000664 0000000 0000000 00000030532 15015373413 0021155 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install location.
#
# This must appear before the project() command: the cache entry is created
# without FORCE, so a value supplied by the user with -D (or one that is
# already in the cache) is never overridden.
#
# CMAKE_BINARY_DIR is used instead of PROJECT_BINARY_DIR because no project()
# command has run yet when this file is configured on its own; at that point
# PROJECT_BINARY_DIR is empty and the Windows default would collapse to
# "/package".
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# Default build type.
#
# This also has to be initialized before project(). The default is Release
# unless the user chose a build type with -D. Generators that define
# CMAKE_CONFIGURATION_TYPES (e.g. the MSVC IDE) do not use CMAKE_BUILD_TYPE,
# so nothing is set for them.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()
project( rocfft-clients-tests LANGUAGES CXX )

# Default C++ standard for targets created in this project.
set( CMAKE_CXX_STANDARD 17 )

# Let include() and find_package( ... MODULE ) search the cmake/ directory
# that sits next to this file.
list( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" )

# Each dependency is looked up only if an enclosing build has not already
# provided it.

# rocFFT itself: skipped when a `rocfft` target already exists.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif()

# HIP runtime.
if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()

# ROCm CMake helpers (version 0.7.3 or newer).
if( NOT ROCM_FOUND )
  find_package( ROCM 0.7.3 REQUIRED )
endif()

# hipRAND, only when requested through USE_HIPRAND.
if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

# Needed for the rocm_install() calls used later in this file.
include( ROCMInstallTargets )
# Translation units that make up the rocfft-test executable. Paths are
# relative to this directory; the last entry comes from the shared/ tree at
# the repository root.
set( rocfft-test_source
  gtest_main.cpp
  rocfft_accuracy_test.cpp
  bitwise_repro/bitwise_repro_test.cpp
  accuracy_test.cpp
  accuracy_test_1D.cpp
  accuracy_test_2D.cpp
  accuracy_test_3D.cpp
  accuracy_test_adhoc.cpp
  accuracy_test_emulation.cpp
  accuracy_test_callback.cpp
  accuracy_test_checkstride.cpp
  multithread_test.cpp
  multi_device_test.cpp
  hermitian_test.cpp
  hipGraph_test.cpp
  callback_change_type.cpp
  default_callbacks_test.cpp
  unit_test.cpp
  buffer_hash_test.cpp
  validate_length_stride.cpp
  random.cpp
  ../../shared/array_validator.cpp
)

# Main test executable.
add_executable( rocfft-test
  ${rocfft-test_source}
  ${rocfft-test_includes}
)

# Small stand-alone executable built from rtc_helper_crash.cpp.
add_executable( rtc_helper_crash rtc_helper_crash.cpp )
# Boost diagnostics. These variables are inputs to the Boost lookup, so they
# have to be set before find_package( Boost ) runs; setting them afterwards
# has no effect on that call.
set( Boost_DEBUG ON )
set( Boost_DETAILED_FAILURE_MSG ON )
find_package( Boost REQUIRED )

# BUILD_FFTW: when ON, FFTW is downloaded and built as part of this build
# instead of using an installed copy.
option( BUILD_FFTW "Download and build FFTW" OFF )

# Look for an installed FFTW if we weren't asked to build it. The lookup is
# not REQUIRED: the result is reported through FFTW_FOUND, which is checked
# later in this file.
if( NOT BUILD_FFTW )
  find_package( FFTW 3.0 MODULE COMPONENTS FLOAT DOUBLE )
endif()
include( ExternalProject )

# Policy CMP0135 exists from CMake 3.24 onwards. When it is available, opt in
# to the NEW behaviour: files extracted from a fetched archive are stamped
# with the extraction time instead of the timestamps stored in the archive.
if( POLICY CMP0135 )
  cmake_policy( SET CMP0135 NEW )
endif()
# Build FFTW from source when that was requested explicitly, and also as a
# fallback when no installed FFTW was found.
if( BUILD_FFTW OR NOT FFTW_FOUND )
  # Source archive and the checksum the download has to match. Both are cache
  # entries, so they can be overridden at configure time.
  set( FFTW_SRC_URL http://www.fftw.org/fftw-3.3.9.tar.gz
    CACHE STRING "Location of FFTW source code" )
  set( FFTW_SRC_SHA256 bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d
    CACHE STRING "SHA256 hash of FFTW source code" )

  # CMake arguments shared by the double- and single-precision builds.
  set( FFTW_CMAKE_ARGS_COMMON
    -DDISABLE_FORTRAN=ON
    -DENABLE_AVX2=ON
    -DENABLE_THREADS=ON
    -DBUILD_SHARED_LIBS=ON
    -DBUILD_TESTS=OFF
    -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}
  )

  # Shared libraries each external build is declared to produce (threaded
  # library first, then the core library).
  set( FFTW_LIBRARIES_DOUBLE
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3_threads${CMAKE_SHARED_LIBRARY_SUFFIX}
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3${CMAKE_SHARED_LIBRARY_SUFFIX}
  )
  set( FFTW_LIBRARIES_SINGLE
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f_threads${CMAKE_SHARED_LIBRARY_SUFFIX}
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f${CMAKE_SHARED_LIBRARY_SUFFIX}
  )

  # Double precision: this project downloads the sources. Nothing is
  # installed; the libraries are used from the build tree.
  ExternalProject_Add( fftw_double
    URL              ${FFTW_SRC_URL}
    URL_HASH         SHA256=${FFTW_SRC_SHA256}
    SOURCE_DIR       ${CMAKE_CURRENT_BINARY_DIR}/src/fftw
    PREFIX           ${CMAKE_CURRENT_BINARY_DIR}
    CMAKE_ARGS       ${FFTW_CMAKE_ARGS_COMMON}
    INSTALL_COMMAND  ""
    BUILD_BYPRODUCTS ${FFTW_LIBRARIES_DOUBLE}
  )
  ExternalProject_Get_Property( fftw_double source_dir binary_dir )

  # Single precision: reuses the source directory populated by fftw_double
  # (hence the empty download step and the dependency on it) and adds
  # -DENABLE_FLOAT=ON.
  ExternalProject_Add( fftw_single
    DOWNLOAD_COMMAND ""
    SOURCE_DIR       ${CMAKE_CURRENT_BINARY_DIR}/src/fftw
    PREFIX           ${CMAKE_CURRENT_BINARY_DIR}
    CMAKE_ARGS       ${FFTW_CMAKE_ARGS_COMMON} -DENABLE_FLOAT=ON
    INSTALL_COMMAND  ""
    BUILD_BYPRODUCTS ${FFTW_LIBRARIES_SINGLE}
    DEPENDS          fftw_double
  )
  ExternalProject_Get_Property( fftw_single source_dir binary_dir )

  # Results exported to the rest of this file.
  set( FFTW_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/src/fftw/api )
  set( FFTW_LIBRARIES
    ${FFTW_LIBRARIES_DOUBLE}
    ${FFTW_LIBRARIES_SINGLE}
  )

  # The FFTW built here always has threads enabled (ENABLE_THREADS=ON above).
  set( FFTW_MULTITHREAD TRUE )

  # rocfft-test must not be built before both FFTW variants exist.
  add_dependencies( rocfft-test fftw_double fftw_single )

  # Ship the locally built FFTW libraries with the clients-common component.
  rocm_install(
    FILES ${FFTW_LIBRARIES}
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw
    COMPONENT clients-common
  )
else()
  # An installed FFTW was found: expose its headers to the targets of this
  # directory.
  include_directories( ${FFTW_INCLUDE_DIRS} )
endif()
# Include directories for rocfft-test; this list is handed to
# target_include_directories( rocfft-test PRIVATE ... ) further down.
# NOTE(review): the three bare `$` entries look like generator expressions
# whose `<...>` part has been lost -- confirm the intended values against the
# upstream file before relying on this list.
set( rocfft-test_include_dirs
$
$
$
${ROCM_CLANG_ROOT}/include
)
# Libraries to link into rocfft-test. Starts with the FFTW libraries; more
# entries are appended below before the list is passed to
# target_link_libraries().
set( rocfft-test_link_libs
${FFTW_LIBRARIES}
)
# GoogleTest setup. The included file is not visible here; presumably it is
# what provides BUILD_GTEST, GTEST_FOUND, GTEST_INCLUDE_DIRS, GTEST_LIBRARIES
# and the `gtest` target used below -- TODO confirm.
include( ../cmake/build-gtest.cmake )
if( BUILD_GTEST OR NOT GTEST_FOUND )
# GoogleTest is built as part of this build: make rocfft-test wait for the
# `gtest` target and use its include directories and libraries.
add_dependencies( rocfft-test gtest )
list( APPEND rocfft-test_include_dirs ${GTEST_INCLUDE_DIRS} )
list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} )
else()
# An existing GoogleTest is used.
# NOTE(review): the bare `$` below looks like a truncated generator
# expression -- confirm the intended include directory.
list( APPEND rocfft-test_include_dirs $ )
list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} )
endif()
# Warning flags for rocfft-test; -Wno-cpp is added on top of WARNING_FLAGS.
target_compile_options( rocfft-test
  PRIVATE
    ${WARNING_FLAGS}
    -Wno-cpp
)

# Header search path collected in rocfft-test_include_dirs.
target_include_directories( rocfft-test PRIVATE ${rocfft-test_include_dirs} )

# When BUILD_SHARED_LIBS is off, the client host/device link libraries are
# added to the link list as well.
if( NOT BUILD_SHARED_LIBS )
  list( APPEND rocfft-test_link_libs
    ${ROCFFT_CLIENTS_HOST_LINK_LIBS}
    ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS}
  )
endif()

# Outside of the rocFFT build scope, SQLite has to be located here and is
# linked through its imported target.
if( NOT ROCFFT_BUILD_SCOPE )
  find_package( SQLite3 REQUIRED )
  set( ROCFFT_SQLITE_LIB SQLite::SQLite3 )
endif()

# Core link line: HIP device runtime, rocFFT, SQLite and everything gathered
# in rocfft-test_link_libs.
target_link_libraries( rocfft-test
  PRIVATE
    hip::device
    roc::rocfft
    ${ROCFFT_SQLITE_LIB}
    ${rocfft-test_link_libs}
)
# Optional hipRAND support.
if( USE_HIPRAND )
  target_link_libraries( rocfft-test PRIVATE hip::hiprand )
endif()

# Optional MPI support.
if( ROCFFT_MPI_ENABLE )
  target_link_libraries( rocfft-test PRIVATE MPI::MPI_CXX )

  # Directory-scoped on purpose: the definition reaches every target created
  # in this directory, not just rocfft-test.
  add_compile_definitions( ROCFFT_MPI_ENABLE )

  # Cray MPI additionally needs the mpi_gtl_hsa library, which is searched
  # for in a gtl/lib directory located relative to the MPI library itself.
  if( ROCFFT_CRAY_MPI_ENABLE )
    target_link_libraries( rocfft-test PRIVATE "mpi_gtl_hsa" )
    get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
    target_link_directories( rocfft-test PRIVATE ${MPI_LIBDIR}/../../../../gtl/lib )
  endif()
endif()
# Filesystem library support for rocfft-test. The helper file is not visible
# here; presumably it defines target_link_std_experimental_filesystem(),
# which is called on the next line -- TODO confirm.
include( ../../cmake/std-filesystem.cmake )
target_link_std_experimental_filesystem( rocfft-test )
# CUDA builds: add extra include directories to rocfft-test and define
# __HIP_PLATFORM_NVCC__ for it.
# NOTE(review): the two bare `$` entries look like truncated generator
# expressions -- confirm the intended include directories.
if( USE_CUDA )
target_include_directories( rocfft-test
PRIVATE
$
$
)
target_compile_definitions( rocfft-test PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
# Client host/device link libraries, linked here regardless of
# BUILD_SHARED_LIBS.
target_link_libraries( rocfft-test
  PRIVATE
    ${ROCFFT_CLIENTS_HOST_LINK_LIBS}
    ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS}
)

# SQLite: pull in the shared SQLite setup, then link the library and add the
# local SQLite source directory to the header search path.
include( ../../cmake/sqlite.cmake )
target_link_libraries( rocfft-test
  PUBLIC
    ${ROCFFT_SQLITE_LIB}
)
target_include_directories( rocfft-test
  PRIVATE
    ${sqlite_local_SOURCE_DIR}
)
set_property(
  TARGET rocfft-test
  APPEND PROPERTY LINK_LIBRARIES ${ROCFFT_SQLITE_LIB}
)
# BUILD_CLIENTS_TESTS_OPENMP: compile and link rocfft-test with OpenMP
# (enabled by default).
option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON )

if( BUILD_CLIENTS_TESTS_OPENMP )
  if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
    # hipcc: plain -fopenmp, with the OpenMP runtime and headers taken from
    # the HIP clang installation (also recorded as an rpath).
    target_compile_options( rocfft-test PRIVATE -fopenmp )
    target_link_libraries( rocfft-test
      PRIVATE
        -fopenmp
        -L${HIP_CLANG_ROOT}/lib
        -Wl,-rpath=${HIP_CLANG_ROOT}/lib
    )
    target_include_directories( rocfft-test PRIVATE ${HIP_CLANG_ROOT}/include )
  elseif( CMAKE_CXX_COMPILER_ID STREQUAL "Clang" )
    # Any other Clang: select libomp explicitly for compiling and linking.
    target_compile_options( rocfft-test PRIVATE -fopenmp=libomp )
    target_link_options( rocfft-test PRIVATE -fopenmp=libomp )
  endif()
endif()
# Tell the test sources that the FFTW in use is multi-threaded. The macro is
# registered as a compile definition (rather than as a raw -D compile
# option) so that CMake tracks it as a preprocessor define and emits it in
# the form the active compiler expects.
if( FFTW_MULTITHREAD )
  target_compile_definitions( rocfft-test PRIVATE FFTW_MULTITHREAD )
endif()
# The C++ standard chosen for rocfft-test is a hard requirement.
set_property( TARGET rocfft-test PROPERTY CXX_STANDARD_REQUIRED ON )

# Directory that receives the test executables; it depends on which build
# scope this file is being configured in.
if( ROCFFT_BUILD_SCOPE )
  set( TESTS_OUT_DIR "${PROJECT_BINARY_DIR}/../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
  set( TESTS_OUT_DIR "${PROJECT_BINARY_DIR}/../bin" )
else()
  set( TESTS_OUT_DIR "${PROJECT_BINARY_DIR}/bin" )
endif()

# Both executables are written to the same directory.
set_target_properties( rocfft-test rtc_helper_crash
  PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${TESTS_OUT_DIR}"
)

# Install both executables as part of the `tests` component.
rocm_install(
  TARGETS rocfft-test rtc_helper_crash
  COMPONENT tests
)
# Windows only: copy third-party DLLs after rocfft-test is built.
if (WIN32)
# Ensure tests run with HIP DLLs and not anything the driver owns
# in system32. Libraries like amdhip64.dll are also in the HIP
# runtime, and we need run with those. But the only way to make a
# same-named DLL override something in system32 is to have it next
# to the executable. So copy them in.
#
# The DLL list is globbed from ${HIP_DIR}/bin plus the libomp140*.dll files
# in C:/Windows/System32. CONFIGURE_DEPENDS asks the build to re-check the
# glob so newly appearing DLLs are picked up.
file( GLOB third_party_dlls
LIST_DIRECTORIES OFF
CONFIGURE_DEPENDS
${HIP_DIR}/bin/*.dll
C:/Windows/System32/libomp140*.dll
)
# One post-build copy command per DLL, attached to rocfft-test.
# NOTE(review): the bare `$` destination looks like a truncated generator
# expression (presumably the directory of the rocfft-test executable, per
# the comment above) -- confirm against the upstream file.
foreach( file_i ${third_party_dlls})
add_custom_command( TARGET rocfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $ )
endforeach( file_i )
endif()
# MPI worker executables, built only when MPI support is enabled. Two
# executables are created from the same source file, rocfft_mpi_worker.cpp:
#   rocfft_mpi_worker      - linked against rocFFT
#   dyna_rocfft_mpi_worker - compiled with ROCFFT_DYNA_MPI_WORKER defined and
#                            linked against ${CMAKE_DL_LIBS} instead
if( ROCFFT_MPI_ENABLE )
# normal and dynamic-loading MPI worker processes
foreach(worker rocfft_mpi_worker dyna_rocfft_mpi_worker)
add_executable( ${worker} rocfft_mpi_worker.cpp )
target_compile_options( ${worker} PRIVATE -fopenmp )
# Header search path: the build tree's include directory, the rocFFT library
# headers, and the MPI C headers.
# NOTE(review): the bare `$` entry looks like a truncated generator
# expression -- confirm the intended include directory.
target_include_directories( ${worker}
PRIVATE
${CMAKE_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/
${MPI_C_INCLUDE_PATH}
$
)
target_compile_options( ${worker} PRIVATE ${WARNING_FLAGS} )
# Both branches link the same libraries; the Cray branch adds mpi_gtl_hsa
# and a gtl/lib link directory located relative to the MPI library.
# Note that hip::hiprand is linked here without checking USE_HIPRAND.
if ( ROCFFT_CRAY_MPI_ENABLE )
target_link_libraries( ${worker}
-fopenmp
hip::hiprand
hip::device
MPI::MPI_CXX
${FFTW_LIBRARIES}
"mpi_gtl_hsa"
)
get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
target_link_directories( ${worker}
PRIVATE
${MPI_LIBDIR}/../../../../gtl/lib )
else()
target_link_libraries( ${worker}
-fopenmp
hip::hiprand
hip::device
MPI::MPI_CXX
${FFTW_LIBRARIES}
)
endif()
# Workers are written to the same output directory as the test executables
# and installed with the `tests` component.
set_target_properties(${worker}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${TESTS_OUT_DIR})
rocm_install(TARGETS ${worker} COMPONENT tests)
endforeach()
# link normal MPI worker against rocFFT
target_link_libraries( rocfft_mpi_worker
roc::rocfft
)
# dyna worker only needs to dynamically load libraries
target_compile_definitions( dyna_rocfft_mpi_worker PRIVATE ROCFFT_DYNA_MPI_WORKER )
target_link_libraries( dyna_rocfft_mpi_worker
${CMAKE_DL_LIBS}
)
endif()
rocFFT-rocm-6.4.3/clients/tests/accuracy_test.cpp 0000664 0000000 0000000 00000062474 15015373413 0021764 0 ustar 00root root 0000000 0000000 // Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "../../shared/accuracy_test.h"
#include "../../shared/rocfft_complex.h"
#include