pax_global_header00006660000000000000000000000064150320224050014503gustar00rootroot0000000000000052 comment=1b897f56e0e972c6263018f5b26522ba7869b0a8 rocSOLVER-rocm-6.4.3/000077500000000000000000000000001503202240500141715ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/.azuredevops/000077500000000000000000000000001503202240500166165ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/.azuredevops/rocm-ci.yml000066400000000000000000000012031503202240500206660ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .github - .jenkins - docs - '.*.y*ml' - '*.md' pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .github - .jenkins - docs - '.*.y*ml' - '*.md' drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/rocSOLVER.yml@pipelines_repo rocSOLVER-rocm-6.4.3/.clang-format000077500000000000000000000065421503202240500165560ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Left AlignOperands: false AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: 
false AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: true BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 'true' } #BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true #BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: Inner #ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 10 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true 
SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- rocSOLVER-rocm-6.4.3/.gitattributes000066400000000000000000000004471503202240500170710ustar00rootroot00000000000000# By default, convert all text files to Unix line endings on check-in # and native line endings on check-out * text=auto # Override the default behavior for specific files *.sh text eol=lf *.bat text eol=crlf # Reduce merge conflicts in changelog /CHANGELOG.md merge=union rocSOLVER-rocm-6.4.3/.github/000077500000000000000000000000001503202240500155315ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/.github/CODEOWNERS000066400000000000000000000011161503202240500171230ustar00rootroot00000000000000* @jzuniga-amd @tfalders @cgmb @qjojo @EdDAzevedo @jmachado-amd @AGonzales-amd # Documentation files docs/* @ROCm/rocm-documentation @jzuniga-amd @tfalders @cgmb @qjojo @EdDAzevedo @jmachado-amd @AGonzales-amd *.md @ROCm/rocm-documentation @jzuniga-amd @tfalders @cgmb @qjojo @EdDAzevedo @jmachado-amd @AGonzales-amd *.rst @ROCm/rocm-documentation @jzuniga-amd @tfalders @cgmb @qjojo @EdDAzevedo @jmachado-amd @AGonzales-amd # Header directory for Doxygen documentation library/include/* @ROCm/rocm-documentation @jzuniga-amd @tfalders @cgmb @qjojo @EdDAzevedo @jmachado-amd @AGonzales-amd rocSOLVER-rocm-6.4.3/.github/dependabot.yml000066400000000000000000000012231503202240500203570ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. 
# Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" rocSOLVER-rocm-6.4.3/.gitignore000066400000000000000000000007071503202240500161650ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # vim tags tags .tags .*.swp # Editors .vscode # build-in-source directory build/ docBin/ # emacs temporary/backup files .\#* \#*\# *~ # documentation artifacts _build/ _images/ _static/ _templates/ _toc.yml rocSOLVER-rocm-6.4.3/.jenkins/000077500000000000000000000000001503202240500157105ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/.jenkins/common.groovy000066400000000000000000000061561503202240500204570ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. def runCompileCommand(platform, project, jobName, boolean sameOrg=false, boolean isStatic=false) { project.paths.construct_build_prefix() String compiler = 'hipcc' String hipClang = '' String debug = project.buildName.contains('Debug') ? '-g' : '' String centos = platform.jenkinsLabel.contains('centos') ? 
'source scl_source enable devtoolset-7' : '' List options = [] Boolean withSparse = true if (env.BRANCH_NAME ==~ /PR-\d+/) { pullRequest.labels.each { if (it == "noOptimizations") { options << "-n" } else if (it == "ci:no-sparse") { options << "--no-sparse" withSparse = false } } } List getDeps = [] getDeps << auxiliary.getLibrary('hipBLAS-common', platform.jenkinsLabel, null, sameOrg) getDeps << auxiliary.getLibrary('hipBLASLt', platform.jenkinsLabel, null, sameOrg) getDeps << auxiliary.getLibrary('rocBLAS', platform.jenkinsLabel, null, sameOrg) if (withSparse) { getDeps << auxiliary.getLibrary('rocSPARSE', platform.jenkinsLabel, null, sameOrg) } getDeps << auxiliary.getLibrary('rocPRIM', platform.jenkinsLabel, null, sameOrg) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} ${getDeps.join('\\n')} ${auxiliary.exitIfNotSuccess()} ${centos} ${project.paths.build_command} ${hipClang} ${debug} ${options.join(' ')} ${auxiliary.exitIfNotSuccess()} """ platform.runCommand(this, command) } def runTestCommand (platform, project, gfilter) { String buildType = project.buildName.contains('Debug') ? 'debug' : 'release' String hmmTestCommand = platform.jenkinsLabel.contains('gfx90a') ? 'HSA_XNACK=1 ./rocsolver-test --gtest_filter=*MANAGED_MALLOC* || true' : '' def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/build/${buildType}/clients/staging ./rocsolver-test --gtest_output=xml --gtest_color=yes --gtest_filter=${gfilter} if [ -f ./test-rocsolver-dlopen ]; then ./test-rocsolver-dlopen --gtest_color=yes fi ${hmmTestCommand} cd ../.. CTEST_OUTPUT_ON_FAILURE=1 ctest -R '^test-rocsolver-bench' """ platform.runCommand(this, command) junit "${project.paths.project_build_prefix}/build/${buildType}/clients/staging/*.xml" } def runPackageCommand(platform, project) { String buildType = project.buildName.contains('Debug') ? 
'debug' : 'release' def packageHelper = platform.makePackage(platform.jenkinsLabel, "${project.paths.project_build_prefix}/build/${buildType}") platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) } return this rocSOLVER-rocm-6.4.3/.jenkins/debug.groovy000066400000000000000000000050621503202240500202500ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'Debug') prj.timeout.compile = 600 prj.timeout.test = 45 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -c' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> def gfilter = 'checkin*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] 
jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 5 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName) } } } rocSOLVER-rocm-6.4.3/.jenkins/extended.groovy000066400000000000000000000051341503202240500207620ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'Extended') prj.timeout.compile = 600 prj.timeout.test = 420 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -c' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> def gfilter = 'daily*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = 
auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx908'],sles15sp1:['gfx906']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 4 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName) } } } rocSOLVER-rocm-6.4.3/.jenkins/precheckin.groovy000066400000000000000000000055671503202240500213070ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'PreCheckin') prj.timeout.compile = 600 prj.timeout.test = 45 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -c --cmake-arg -DWERROR=ON' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> def gfilter = 'checkin*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 6')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 6')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu18:['gfx900'],centos7:['gfx908'],sles15sp1:['gfx906']]), "compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx908'],centos8:['gfx906'],sles15sp1:['gfx906']]), "rocm-docker":([ubuntu18:['gfx900'],centos7:['gfx908'],sles15sp1:['gfx906']])] jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocSOLVER') propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 5 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx900', 'gfx906']], urlJobName) } } } rocSOLVER-rocm-6.4.3/.jenkins/static.groovy000066400000000000000000000046521503202240500204550ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'static') prj.timeout.compile = 600 prj.timeout.test = 45 prj.defaults.ccache = true // customize for project prj.paths.build_command = './install.sh -c --cmake-arg -DWERROR=ON --static' // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = true def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, true, true) } def testCommand = { platform, project-> def gfilter = 'checkin*' commonGroovy.runTestCommand(platform, project, gfilter) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["main":[pipelineTriggers([cron('0 1 * * 6')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["main":([ubuntu22:['gfx90a']])] jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocSOLVER') propertyList.each { jobName, property-> if 
(urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu22:['gfx90a']], urlJobName) } } } rocSOLVER-rocm-6.4.3/.jenkins/staticanalysis.groovy000066400000000000000000000027411503202240500222160ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocSOLVER', 'StaticAnalysis') // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = true boolean staticAnalysis = true buildProject(prj, formatCheck, nodes.dockerArray, null, null, null, staticAnalysis) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":[]] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } } 
rocSOLVER-rocm-6.4.3/.readthedocs.yaml000066400000000000000000000005021503202240500174150ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "3.10" rocSOLVER-rocm-6.4.3/CHANGELOG.md000066400000000000000000000463231503202240500160120ustar00rootroot00000000000000# Change Log for rocSOLVER Full documentation for rocSOLVER is available at the [rocSOLVER documentation](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/index.html). ## rocSOLVER 3.28.2 for ROCm 6.4.2 ### Added * Hybrid computation support for existing routines: - STERF * SVD for general matrices based on Cuppen's Divide and Conquer algorithm: - GESDD (with batched and strided\_batched versions) ### Optimized * Reduced the device memory requirements for STEDC, SYEVD/HEEVD, and SYGVD/HEGVD * Improved the performance of STEDC and divide and conquer Eigensolvers * Improved the performance of SYTRD, the initial step of the Eigensolvers that start with the tridiagonalization of the input matrix ## rocSOLVER 3.28.0 for ROCm 6.4.0 ### Added * Application of a sequence of plane rotations to a given matrix - LASR * Algorithm selection mechanism for hybrid computation * Hybrid computation support for existing routines: - BDSQR - GESVD ### Optimized * Improved the performance of SYEVJ * Improved the performance of GEQRF ## rocSOLVER 3.27.0 for ROCm 6.3.0 ### Added * 64-bit APIs for existing functions: - LACGV_64 - LARF_64 - LARFG_64 - GEQR2_64 (with batched and strided\_batched versions) - GEQRF_64 (with batched and strided\_batched versions) - POTF2_64 (with batched and strided\_batched versions) - POTRF_64 (with batched and strided\_batched versions) - POTRS_64 (with batched and strided\_batched versions) * Support added for the gfx1151, gfx1200, 
and gfx1201 architectures ### Changed * The rocSPARSE library is now an optional dependency at runtime. If rocSPARSE is not available, rocSOLVER's sparse refactorization and solvers functions will return `rocblas_status_not_implemented`. ### Optimized * Improved the performance of LARFG, LARF, and downstream functions such as GEQR2 and GEQRF on wave64 architectures * Improved the performance of BDSQR and GESVD * Improved the performance of STEDC and divide and conquer Eigensolvers ### Resolved issues * Fixed a memory allocation issue in SYEVJ that could cause failures on clients that manage their own memory * Fixed a synchronization issue with SYEVJ that could lead to a convergence failure for large matrices * Fixed a convergence issue in STEIN stemming from numerical orthogonality of the initial choice of eigenvectors * Fixed synchronization issue in STEIN ### Known issues * A known issue in STEBZ can lead to errors in routines based on bisection to compute eigenvalues for symmetric/hermitian matrices (for example, SYEVX/HEEVX and SYGVX/HEGVX), as well as singular values (for example, BDSVDX and GESVDX). ## rocSOLVER 3.26.0 for ROCm 6.2.0 ### Added * 64-bit APIs for existing functions: * GETF2_64 (with batched and strided\_batched versions) * GETRF_64 (with batched and strided\_batched versions) * GETRS_64 (with batched and strided\_batched versions) * Added gfx900 to default build targets. * Partial eigenvalue decomposition routines for symmetric/hermitian matrices using Divide & Conquer and Bisection: * SYEVDX (with batched and strided\_batched versions) * HEEVDX (with batched and strided\_batched versions) * Partial generalized symmetric/hermitian-definite eigenvalue decomposition using Divide & Conquer and Bisection: * SYGVDX (with batched and strided\_batched versions) * HEGVDX (with batched and strided\_batched versions) ### Changed * Renamed install script arguments of the form *_dir to *-path. 
Arguments of the form *_dir remain functional for backwards compatibility. * Functions working with arrays of size n - 1 can now accept null pointers when n = 1. ### Optimized * Improved performance of Cholesky factorization. * Improved performance of splitlu to extract the L and U triangular matrices from the result of sparse factorization matrix M, where M = (L - eye) + U. ### Resolved issues * Fixed potential accuracy degradation in SYEVJ/HEEVJ for inputs with small eigenvalues. ## rocSOLVER 3.25.0 for ROCm 6.1.0 ### Added * Eigensolver routines for symmetric/hermitian matrices using Divide & Conquer and Jacobi algorithm: * SYEVDJ (with batched and strided\_batched versions) * HEEVDJ (with batched and strided\_batched versions) * Generalized symmetric/hermitian-definite eigensolvers using Divide & Conquer and Jacobi algorithm: * SYGVDJ (with batched and strided\_batched versions) * HEGVDJ (with batched and strided\_batched versions) ### Changed * Relaxed array length requirements for GESVDX with `rocblas_srange_index`. ### Removed * Removed gfx803 and gfx900 from default build targets. ### Resolved issues * Corrected singular vector normalization in BDSVDX and GESVDX * Fixed potential memory access fault in STEIN, SYEVX/HEEVX, SYGVX/HEGVX, BDSVDX and GESVDX ## rocSOLVER 3.24.0 for ROCm 6.0.0 ### Added - Cholesky refactorization for sparse matrices - CSRRF_REFACTCHOL - Added `rocsolver_rfinfo_mode` and the ability to specify the desired refactorization routine (see `rocsolver_set_rfinfo_mode`). 
### Changed - CSRRF_ANALYSIS and CSRRF_SOLVE now support sparse Cholesky factorization ## rocSOLVER 3.23.0 for ROCm 5.7.0 ### Added - LU factorization without pivoting for block tridiagonal matrices: - GEBLTTRF_NPVT now supports interleaved\_batched format - Linear system solver without pivoting for block tridiagonal matrices: - GEBLTTRS_NPVT now supports interleaved\_batched format ### Fixed - Fixed stack overflow in sparse tests on Windows ### Changed - Changed rocsolver-test sparse input data search paths to be relative to the test executable - Changed build scripts to default to compressed debug symbols in Debug builds ## rocSOLVER 3.22.0 for ROCm 5.6.0 ### Added - LU refactorization for sparse matrices - CSRRF_ANALYSIS - CSRRF_SUMLU - CSRRF_SPLITLU - CSRRF_REFACTLU - Linear system solver for sparse matrices - CSRRF_SOLVE - Added type `rocsolver_rfinfo` for use with sparse matrix routines ### Optimized - Improved the performance of BDSQR and GESVD when singular vectors are requested - Improved the performance of sorting algorithms used in different eigensolvers ### Fixed - BDSQR and GESVD should no longer hang when the input contains `NaN` or `Inf` ## rocSOLVER 3.21.0 for ROCm 5.5.0 ### Added - SVD for general matrices using Jacobi algorithm: - GESVDJ (with batched and strided\_batched versions) - LU factorization without pivoting for block tridiagonal matrices: - GEBLTTRF_NPVT (with batched and strided\_batched versions) - Linear system solver without pivoting for block tridiagonal matrices: - GEBLTTRS_NPVT (with batched and strided\_batched versions) - Product of triangular matrices - LAUUM - Added experimental hipGraph support for rocSOLVER functions ### Optimized - Improved the performance of SYEVJ/HEEVJ. ### Changed - STEDC, SYEVD/HEEVD and SYGVD/HEGVD now use fully implemented Divide and Conquer approach. ### Fixed - SYEVJ/HEEVJ should now be invariant under matrix scaling. 
- SYEVJ/HEEVJ should now properly output the eigenvalues when no sweeps are executed. - Fixed GETF2\_NPVT and GETRF\_NPVT input data initialization in tests and benchmarks. - Fixed rocblas missing from the dependency list of the rocsolver deb and rpm packages. ## rocSOLVER 3.20.0 for ROCm 5.4.0 ### Added - Partial SVD for bidiagonal matrices: - BDSVDX - Partial SVD for general matrices: - GESVDX (with batched and strided\_batched versions) ### Changed - Changed `ROCSOLVER_EMBED_FMT` default to `ON` for users building directly with CMake. This matches the existing default when building with install.sh or rmake.py. ## rocSOLVER 3.19.0 for ROCm 5.3.0 ### Added - Partial eigensolver routines for symmetric/hermitian matrices: - SYEVX (with batched and strided\_batched versions) - HEEVX (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite partial eigensolvers: - SYGVX (with batched and strided\_batched versions) - HEGVX (with batched and strided\_batched versions) - Eigensolver routines for symmetric/hermitian matrices using Jacobi algorithm: - SYEVJ (with batched and strided\_batched versions) - HEEVJ (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite eigensolvers using Jacobi algorithm: - SYGVJ (with batched and strided\_batched versions) - HEGVJ (with batched and strided\_batched versions) - Added --profile_kernels option to rocsolver-bench, which will include kernel calls in the profile log (if profile logging is enabled with --profile). ### Changed - Changed rocsolver-bench result labels `cpu_time` and `gpu_time` to `cpu_time_us` and `gpu_time_us`, respectively. ### Removed - Removed dependency on cblas from the rocsolver test and benchmark clients. ### Fixed - Fixed incorrect SYGS2/HEGS2, SYGST/HEGST, SYGV/HEGV, and SYGVD/HEGVD results for batch counts larger than 32. - Fixed STEIN memory access fault when nev is 0. 
- Fixed incorrect STEBZ results for close eigenvalues when range = index. - Fixed git unsafe repository error when building with `./install.sh -cd` as a non-root user. ## rocSOLVER 3.18.0 for ROCm 5.2.0 ### Added - Partial eigenvalue decomposition routines: - STEBZ - STEIN - Package generation for test and benchmark executables on all supported OSes using CPack. - Added tests for multi-level logging - Added tests for rocsolver-bench client - File/Folder Reorg - Added File/Folder Reorg Changes with backward compatibility support using ROCM-CMAKE wrapper functions. ### Fixed - Fixed compatibility with libfmt 8.1 ## rocSOLVER 3.17.0 for ROCm 5.1.0 ### Optimized - Optimized non-pivoting and batch cases of the LU factorization ### Fixed - Fixed missing synchronization in SYTRF with `rocblas_fill_lower` that could potentially result in incorrect pivot values. - Fixed multi-level logging output to file with the `ROCSOLVER_LOG_PATH`, `ROCSOLVER_LOG_TRACE_PATH`, `ROCSOLVER_LOG_BENCH_PATH` and `ROCSOLVER_LOG_PROFILE_PATH` environment variables. - Fixed performance regression in the batched LU factorization of tiny matrices ## rocSOLVER 3.16.0 for ROCm 5.0.0 ### Added - Symmetric matrix factorizations: - LASYF - SYTF2, SYTRF (with batched and strided\_batched versions) - Added `rocsolver_get_version_string_size` to help with version string queries - Added `rocblas_layer_mode_ex` and the ability to print kernel calls in the trace and profile logs - Expanded batched and strided\_batched sample programs. ### Optimized - Improved general performance of LU factorization - Increased parallelism of specialized kernels when compiling from source, reducing build times on multi-core systems. 
### Changed - The rocsolver-test client now prints the rocSOLVER version used to run the tests, rather than the version used to build them - The rocsolver-bench client now prints the rocSOLVER version used in the benchmark ### Fixed - Added missing stdint.h include to rocsolver.h ## rocSOLVER 3.15.0 for ROCm 4.5.0 ### Added - Eigensolver routines for symmetric/hermitian matrices using Divide and Conquer algorithm: - STEDC - SYEVD (with batched and strided\_batched versions) - HEEVD (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite eigensolvers using Divide and Conquer algorithm: - SYGVD (with batched and strided\_batched versions) - HEGVD (with batched and strided\_batched versions) - Added --mem\_query option to rocsolver-bench, which will print the amount of device memory required by a function. - Added --profile option to rocsolver-bench, which will print profile logging results for a function. - RQ factorization routines: - GERQ2, GERQF (with batched and strided\_batched versions) - Linear solvers for general square systems: - GESV (with batched and strided\_batched versions) - Linear solvers for symmetric/hermitian positive definite systems: - POTRS (with batched and strided\_batched versions) - POSV (with batched and strided\_batched versions) - Inverse of symmetric/hermitian positive definite matrices: - POTRI (with batched and strided\_batched versions) - General matrix inversion without pivoting: - GETRI\_NPVT (with batched and strided\_batched versions) - GETRI\_NPVT\_OUTOFPLACE (with batched and strided\_batched versions) ### Optimized - Improved performance of LU factorization (especially for large matrix sizes) ### Changed - The -h option of install.sh now prints a help message, instead of doing nothing. 
- libfmt 7.1 is now a dependency - Raised minimum requirement for building rocSOLVER from source to CMake 3.13 - Raised reference LAPACK version used for rocSOLVER test and benchmark clients to v3.9.1 - Minor CMake improvements for users building from source without install.sh: - Removed fmt::fmt from rocsolver's public usage requirements - Enabled small-size optimizations by default - Split packaging into a runtime package ('rocsolver') and a development package ('rocsolver-devel'). The development package depends on the runtime package. To aid in the transition, the runtime package suggests the development package (except on CentOS 7). This use of the suggests feature is deprecated and will be removed in a future ROCm release. ### Fixed - Use of the GCC / Clang `__attribute__((deprecated(...)))` extension is now guarded by compiler detection macros. ## rocSOLVER 3.13.0 for ROCm 4.3.0 ### Added - Linear solvers for general non-square systems: - GELS now supports underdetermined and transposed cases - Inverse of triangular matrices - TRTRI (with batched and strided\_batched versions) - Out-of-place general matrix inversion - GETRI\_OUTOFPLACE (with batched and strided\_batched versions) ### Optimized - Improved general performance of matrix inversion (GETRI) ### Changed - Argument names for the benchmark client now match argument names from the public API ### Fixed - Fixed known issues with Thin-SVD. The problem was identified in the test specification, not in the thin-SVD implementation or the rocBLAS gemm\_batched routines. - Benchmark client will no longer crash as a result of leading dimension or stride arguments not being provided on the command line. 
## rocSOLVER 3.12.0 for ROCm 4.2.0 ### Added - Multi-level logging functionality - Implementation of the Thin-SVD algorithm - Reductions of generalized symmetric- and hermitian-definite eigenproblems: - SYGS2, SYGST (with batched and strided\_batched versions) - HEGS2, HEGST (with batched and strided\_batched versions) - Symmetric and hermitian matrix eigensolvers: - SYEV (with batched and strided\_batched versions) - HEEV (with batched and strided\_batched versions) - Generalized symmetric- and hermitian-definite eigensolvers: - SYGV (with batched and strided\_batched versions) - HEGV (with batched and strided\_batched versions) ### Changed - Sorting method in STERF as original quick-sort was failing for large sizes. ### Removed - Removed hcc compiler support ### Fixed - Fixed GELS overwriting B even when info != 0 - Error when calling STEQR with n=1 from batched routines - Added `roc::rocblas` to the `roc::rocsolver` CMake usage requirements - Added rocblas to the dependency list of the rocsolver deb and rpm packages - Fixed rocblas symbol loading with dlopen and the `RTLD_NOW | RTLD_LOCAL` options ### Known Issues - Thin-SVD implementation is failing in some cases (in particular m=300, n=120) due to a possible bug in the gemm\_batched routines of rocBLAS. ## rocSOLVER 3.11.0 for ROCm 4.1.0 ### Added - Eigensolver routines for symmetric/hermitian matrices: - STERF, STEQR - Linear solvers for general non-square systems: - GELS (API added with batched and strided\_batched versions. Only the overdetermined non-transpose case is implemented in this release. Other cases will return `rocblas_status_not_implemented` status for now.) 
- Extended test coverage for functions returning info - Changelog file - Tridiagonalization routines for symmetric and hermitian matrices: - LATRD - SYTD2, SYTRD (with batched and strided\_batched versions) - HETD2, HETRD (with batched and strided\_batched versions) - Sample code and unit test for unified memory model/Heterogeneous Memory Management (HMM) ### Optimized - Improved performance of LU factorization of small and mid-size matrices (n <= 2048) ### Changed - Raised minimum requirement for building rocSOLVER from source to CMake 3.8 - Switched to use semantic versioning for the library - Enabled automatic reallocation of memory workspace in rocsolver clients ### Removed - Removed `-DOPTIMAL` from the `roc::rocsolver` CMake usage requirements. This is an internal rocSOLVER definition, and does not need to be defined by library users ### Fixed - Fixed runtime errors in debug mode caused by incorrect kernel launch bounds - Fixed complex unit test bug caused by incorrect zaxpy function signature - Eliminated a small memory transfer that was being done on the default stream - Fixed GESVD right singular vectors for 1x1 matrices ## rocSOLVER 3.10.0 for ROCm 3.10.0 ### Added - Orthonormal/Unitary matrix generator routines (reverse order): - ORG2L, UNG2L, ORGQL, UNGQL - ORGTR, UNGTR - Orthonormal/Unitary matrix multiplications routines (reverse order): - ORM2L, UNM2L, ORMQL, UNMQL - ORMTR, UNMTR ### Changed - Major library refactoring to adopt rocBLAS memory model ### Fixed - Returned values in parameter info of functions dealing with singularities ## rocSOLVER 3.9.0 for ROCm 3.9.0 ### Added - Improved debug build mode for developers - QL factorization routines: - GEQL2, GEQLF (with batched and strided\_batched versions) - SVD of general matrices routines: - GESVD (with batched and strided\_batched versions) ### Optimized - Improved performance of mid-size matrix inversion (64 < n <= 2048) ## rocSOLVER 3.8.0 for ROCm 3.8.0 ### Added - Sample codes for C, C++ and 
FORTRAN - LU factorization without pivoting routines: - GETF2\_NPVT, GETRF\_NPVT (with batched and strided\_batched versions) ### Optimized - Improved performance of LU factorization of mid-size matrices (64 < n <= 2048) - Improved performance of small-size matrix inversion (n <= 64) ### Fixed - Ensure the public API is C compatible ## rocSOLVER 3.7.0 for ROCm 3.7.0 ### Added - LU-factorization-based matrix inverse routines: - GETRI (with batched and strided\_batched versions) - SVD of bidiagonal matrices routine: - BDSQR ### Fixed - Ensure congruency on the input data when executing performance tests (benchmarks) ## rocSOLVER 3.6.0 for ROCm 3.6.0 ### Added - Complex precision support for all existing rocSOLVER functions - Bidiagonalization routines: - LABRD - GEBD2, GEBRD (with batched and strided\_batched versions) - Integration of rocSOLVER to hipBLAS ### Optimized - Improved performance of LU factorization of tiny matrices (n <= 64) ### Changed - Major clients refactoring to achieve better test coverage and benchmarking ## rocSOLVER 3.5.0 for ROCm 3.5.0 ### Added - Installation script and new build procedure - Documentation and integration with ReadTheDocs - Orthonormal matrix multiplication routines: - ORM2R, ORMQR - ORML2, ORMLQ - ORMBR ### Changed - Switched to use all rocBLAS types and enumerations - Major library refactoring to achieve better integration and rocBLAS support - hip-clang is now default compiler ### Deprecated - rocSOLVER types and enumerations - hcc compiler support rocSOLVER-rocm-6.4.3/CMakeLists.txt000066400000000000000000000310551503202240500167350ustar00rootroot00000000000000# ########################################################################## # Copyright (C) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. 
Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # ########################################################################## cmake_minimum_required(VERSION 3.13) # This has to be initialized before the project() command appears # Set the default build type to Release if(NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel.") endif() if(NOT DEFINED CMAKE_Fortran_COMPILER AND NOT DEFINED ENV{FC}) set(CMAKE_Fortran_COMPILER "gfortran") endif() # ROCM_BUILD_ID is added to the package name by rocm-cmake. Unsetting it prevents that. unset(ENV{ROCM_BUILD_ID}) # Disable ROCMClang detection to make CMake v3.21 work the same as CMake v3.20 and earlier. 
# https://gitlab.kitware.com/cmake/cmake/-/merge_requests/6533 if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.21.0 AND CMAKE_VERSION VERSION_LESS 3.21.3) set(__skip_rocmclang ON) endif() message(STATUS "Using CMake ${CMAKE_VERSION}") project(rocsolver LANGUAGES CXX C) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_CXX_STANDARD_REQUIRED ON) option(ROCSOLVER_EMBED_FMT "Hide libfmt symbols" ON) option(OPTIMAL "Build specialized kernels for small matrix sizes" ON) option(ROCSOLVER_FIND_PACKAGE_LAPACK_CONFIG "Skip module mode search for LAPACK" ON) option(ROCSOLVER_USE_INTERNAL_BLAS "Use internal implementation of GEMM and TRSM for debugging." OFF) option(ROCSOLVER_USE_REFERENCE_SECULAR_EQUATIONS_SOLVER "Use LAPACK like implementation of secular equations solver" ON) # Add our CMake helper files to the lookup path list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) find_package(fmt REQUIRED) # ######################################################################## # Main # ######################################################################## # Get rocm-cmake include(get-rocm-cmake) # Include the rocm-cmake components we use include(ROCMSetupVersion) include(ROCMCreatePackage) include(ROCMInstallTargets) include(ROCMPackageConfigHelpers) include(ROCMInstallSymlinks) include(ROCMCheckTargetIds) include(ROCMClients) include(ROCMHeaderWrapper) include(os-detection) get_os_id(OS_ID) message(STATUS "OS detected is ${OS_ID}") # Versioning via rocm-cmake set(VERSION_STRING "3.28.2") rocm_setup_version(VERSION ${VERSION_STRING}) # Workaround until llvm and hip CMake modules fix symlink logic in their config files list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm ${ROCM_PATH}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip ) if(CMAKE_BUILD_TYPE STREQUAL "Debug") set(DEFAULT_ARMOR_LEVEL 1) else() set(DEFAULT_ARMOR_LEVEL 0) endif() set(ARMOR_LEVEL "${DEFAULT_ARMOR_LEVEL}" CACHE 
STRING "Enables increasingly expensive runtime correctness checks") include(armor-config) # This option only works for make, nmake and ninja, but no reason it shouldn't be on all the time # It creates a compile_commands.json file for use with clang tooling or vim set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # BUILD_SHARED_LIBS is a cmake built-in # Make it an explicit option such that it shows in cmake-gui option(BUILD_SHARED_LIBS "Build rocSOLVER as a shared library" ON) # Include helper functions and wrapper functions include(util) include(CMakeDependentOption) include(CheckCXXCompilerFlag) option(BUILD_TESTING "Build rocSOLVER tests" OFF) if(BUILD_TESTING) enable_testing() endif() if(BUILD_SHARED_LIBS) set(BUILD_WITH_SPARSE_DEFAULT OFF) else() set(BUILD_WITH_SPARSE_DEFAULT ON) endif() option(BUILD_LIBRARY "Build rocSOLVER library" ON) option_opposite(BUILD_LIBRARY SKIP_LIBRARY) option(BUILD_WITH_SPARSE "Build with rocsparse available at build time" "${BUILD_WITH_SPARSE_DEFAULT}") option(BUILD_CLIENTS_TESTS "Build rocSOLVER test client" "${BUILD_TESTING}") option(BUILD_CLIENTS_BENCHMARKS "Build rocSOLVER benchmark client" OFF) option(BUILD_CLIENTS_SAMPLES "Build rocSOLVER samples" OFF) cmake_dependent_option(BUILD_CLIENTS_EXTRA_TESTS "Build extra tests" OFF BUILD_TESTING OFF) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) option(BUILD_CODE_COVERAGE "Build rocSOLVER with code coverage enabled" OFF) option(WERROR "Treat warnings as errors" OFF) option(BUILD_COMPRESSED_DBG "Enable compressed debug symbols" ON) check_cxx_compiler_flag("--offload-compress" CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS) cmake_dependent_option(BUILD_OFFLOAD_COMPRESS "Build with offload compression" ON CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS OFF) cmake_dependent_option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg backward compatibility enabled" OFF "NOT WIN32" OFF) if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY) rocm_wrap_header_dir( 
${CMAKE_SOURCE_DIR}/library/include/rocsolver PATTERNS "*.h" GUARDS SYMLINK WRAPPER WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR} ) endif() message(STATUS "Tests: ${BUILD_CLIENTS_TESTS}") message(STATUS "Benchmarks: ${BUILD_CLIENTS_BENCHMARKS}") message(STATUS "Samples: ${BUILD_CLIENTS_SAMPLES}") if(NOT DEFINED AMDGPU_TARGETS) set(XNACK_PLUS_TARGETS gfx90a:xnack+ gfx908:xnack+ gfx942:xnack+) set(XNACK_MINUS_TARGETS gfx90a:xnack-) set(MISC_TARGETS gfx942 gfx1100 gfx1101 gfx1102 gfx1151 gfx1200 gfx1201) if(BUILD_ADDRESS_SANITIZER) set(OPTIONAL_TARGETS_QUERY ${XNACK_PLUS_TARGETS}) else() set(OPTIONAL_TARGETS_QUERY ${XNACK_MINUS_TARGETS} ${MISC_TARGETS}) set(DEFAULT_TARGETS gfx900 gfx906:xnack- gfx908:xnack- gfx1010 gfx1030) endif() # Query for compiler support of GPU archs rocm_check_target_ids(OPTIONAL_AMDGPU_TARGETS TARGETS ${OPTIONAL_TARGETS_QUERY}) set(AMDGPU_TARGETS_INIT ${OPTIONAL_AMDGPU_TARGETS} ${DEFAULT_TARGETS}) endif() # Set this before finding hip so that hip::device has the required arch flags # added as usage requirements on its interface set(AMDGPU_TARGETS "${AMDGPU_TARGETS_INIT}" CACHE STRING "List of specific machine types for library to target") # Find HIP dependencies find_package(hip REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm) find_package(rocblas REQUIRED CONFIG PATHS ${ROCM_PATH}) get_imported_target_location(location roc::rocblas) message(STATUS "Found rocBLAS: ${location}") set(rocblas_minimum 4.4) rocm_package_add_dependencies(SHARED_DEPENDS "rocblas >= ${rocblas_minimum}") rocm_package_add_rpm_dependencies(STATIC_DEPENDS "rocblas-static-devel >= ${rocblas_minimum}") rocm_package_add_deb_dependencies(STATIC_DEPENDS "rocblas-static-dev >= ${rocblas_minimum}") if(BUILD_WITH_SPARSE) find_package(rocsparse REQUIRED CONFIG PATHS ${ROCM_PATH}) get_imported_target_location(location roc::rocsparse) message(STATUS "Found rocSPARSE: ${location}") set(rocsparse_minimum 2.2) rocm_package_add_dependencies(SHARED_DEPENDS "rocsparse >= 
${rocsparse_minimum}") rocm_package_add_rpm_dependencies(STATIC_DEPENDS "rocsparse-static-devel >= ${rocsparse_minimum}") rocm_package_add_deb_dependencies(STATIC_DEPENDS "rocsparse-static-dev >= ${rocsparse_minimum}") else() list(APPEND CPACK_DEBIAN_RUNTIME_PACKAGE_RECOMMENDS "rocsparse") list(APPEND CPACK_RPM_RUNTIME_PACKAGE_SUGGESTS "rocsparse") endif() find_package(rocprim REQUIRED CONFIG PATHS ${ROCM_PATH}) rocm_package_add_rpm_dependencies(STATIC_DEPENDS "rocprim-static-devel") rocm_package_add_deb_dependencies(STATIC_DEPENDS "rocprim-static-dev") add_subdirectory(common) if(BUILD_LIBRARY) add_subdirectory(library) endif() if(BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_SAMPLES) if(NOT CLIENTS_OS) rocm_set_os_id(CLIENTS_OS) string(TOLOWER "${CLIENTS_OS}" CLIENTS_OS) rocm_read_os_release(CLIENTS_OS_VERSION VERSION_ID) endif() set(GFORTRAN_RPM "libgfortran4") set(GFORTRAN_DEB "libgfortran4") if(CLIENTS_OS STREQUAL "centos" OR CLIENTS_OS STREQUAL "rhel") if(CLIENTS_OS_VERSION VERSION_GREATER_EQUAL "8") set(GFORTRAN_RPM "libgfortran") endif() elseif(CLIENTS_OS STREQUAL "ubuntu" AND CLIENTS_OS_VERSION VERSION_GREATER_EQUAL "20.04") set(GFORTRAN_DEB "libgfortran5") elseif(CLIENTS_OS STREQUAL "mariner" OR CLIENTS_OS STREQUAL "azurelinux") set(GFORTRAN_RPM "gfortran") endif() rocm_package_setup_component(clients) if(BUILD_CLIENTS_TESTS) rocm_package_setup_client_component(tests DEPENDS DEB "${GFORTRAN_DEB}" RPM "${GFORTRAN_RPM}") endif() if(BUILD_CLIENTS_BENCHMARKS) rocm_package_setup_client_component(benchmarks DEPENDS DEB "${GFORTRAN_DEB}" RPM "${GFORTRAN_RPM}") endif() add_subdirectory(clients) endif() if(OS_ID_sles) rocm_package_add_rpm_dependencies("libLLVM >= 7.0.1") endif() set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE.md") set(CPACK_RPM_PACKAGE_LICENSE "BSD") if(WIN32) set(CPACK_SOURCE_GENERATOR "ZIP") set(CPACK_GENERATOR "ZIP") set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE) set(INSTALL_PREFIX 
"C:/hipSDK") set(CPACK_SET_DESTDIR OFF) set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK") set(CPACK_PACKAGING_INSTALL_PREFIX "") set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) else() if(NOT CPACK_PACKAGING_INSTALL_PREFIX) set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}") endif() endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" ) set(ROCSOLVER_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file") rocm_create_package( NAME rocsolver DESCRIPTION "AMD ROCm SOLVER library" MAINTAINER "RocSOLVER maintainer " LDCONFIG LDCONFIG_DIR ${ROCSOLVER_CONFIG_DIR} ) # Code Coverage Build Commands: # make coverage_cleanup (clean coverage related files) # make coverage GTEST_FILTER=<> # make coverage_analysis GTEST_FILTER=<> (analyze tests) # make coverage_output (generate html documentation) if(BUILD_CODE_COVERAGE) # Run coverage analysis add_custom_target(coverage_analysis COMMAND echo Coverage GTEST_FILTER=\${GTEST_FILTER} COMMAND ./clients/staging/rocsolver-test --gtest_filter=\"\${GTEST_FILTER}\" WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) add_dependencies(coverage_analysis rocsolver) # Generate gcov-tool script # This little script is generated because the option '--gcov-tool ' of lcov cannot take arguments. add_custom_target(coverage_output DEPENDS coverage_analysis COMMAND mkdir -p lcoverage COMMAND echo "\\#!/bin/bash" > llvm-gcov.sh COMMAND echo "\\# THIS FILE HAS BEEN GENERATED" >> llvm-gcov.sh COMMAND printf "exec /opt/rocm/llvm/bin/llvm-cov gcov $$\\@" >> llvm-gcov.sh COMMAND chmod +x llvm-gcov.sh ) # Generate code coverage report add_custom_command(TARGET coverage_output COMMAND lcov --directory . --base-directory . 
--gcov-tool ${CMAKE_BINARY_DIR}/llvm-gcov.sh --capture -o lcoverage/raw_main_coverage.info COMMAND lcov --remove lcoverage/raw_main_coverage.info "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info COMMAND genhtml --ignore-errors source lcoverage/main_coverage.info --output-directory lcoverage ) add_custom_target(coverage DEPENDS coverage_output) # Delete gcov data files add_custom_target(coverage_cleanup COMMAND find ${CMAKE_BINARY_DIR} -name *.gcda -delete WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) endif() rocSOLVER-rocm-6.4.3/CONTRIBUTING.md000066400000000000000000000043601503202240500164250ustar00rootroot00000000000000# Contributing ## Philosophy AMD welcomes contributions from the community. Whether those contributions are bug reports, bug fixes, documentation additions, performance notes, or other improvements, we value collaboration with our users. We can build better solutions together. # Submitting a Pull Request To contribute changes to rocSOLVER, open a pull request targeting the `develop` branch. Pull requests will be tested and reviewed by the AMD development team. AMD may request changes or modify the submission before acceptance. ## Interface requirements The public interface must be: - C99 compatible - Source and binary compatible with previous releases - Fully documented with Doxygen and Sphinx All identifiers in the public headers must be prefixed with `rocblas`, `ROCBLAS`, `rocsolver`, or `ROCSOLVER`. All user-visible symbols must be prefixed with `rocblas` or `rocsolver`. ## Style guide In general, follow the style of the surrounding code. All code is auto-formatted using clang-format. To apply the rocsolver formatting, run `clang-format -i -style=file ` on any files you've changed. You can install git hooks to do this automatically upon commit by running `scripts/install-hooks --get-clang-format`. If you find you'd rather not use the hooks, they can be removed using `scripts/uninstall-hooks`. 
## Tests To run the rocSOLVER test suite, first build the rocSOLVER test client following the instructions in [Building and Installation][1]. Then, run the `rocsolver-test` binary. For a typical build, the test binary will be found at `./build/release/clients/staging/rocsolver-test`. The full test suite is quite large and may take a long time to complete, so passing the [`--gtest_filter=`][2] option to rocsolver-test may be useful during development. A fast subset of tests can be run with `--gtest_filter='checkin*'`, while the extended tests can be run with `--gtest_filter='daily*'`. ## Rejected contributions Unfortunately, sometimes a contribution cannot be accepted. The rationale for a decision may or may not be disclosed. [1]: https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/userguide/install.html [2]: https://github.com/google/googletest/blob/release-1.10.0/googletest/docs/advanced.md#running-a-subset-of-the-tests rocSOLVER-rocm-6.4.3/LICENSE.md000066400000000000000000000120101503202240500155670ustar00rootroot00000000000000Copyright (c) 2018-2025 Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. This product includes code derived from the LAPACK and MAGMA projects. Copyright holders for these projects are indicated below, and distributed under their license terms as specified. -- LAPACK -- - Copyright (c) 1992-2013 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. - Copyright (c) 2000-2013 The University of California Berkeley. All rights reserved. - Copyright (c) 2006-2013 The University of Colorado Denver. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. - Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. The copyright holders provide no reassurances that the source code provided does not infringe any patent, copyright, or any other intellectual property rights of third parties. 
The copyright holders disclaim any liability to any recipient for claims brought against recipient by any third party for infringement of that parties intellectual property rights. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -- MAGMA -- Copyright (c) 2009-2021 The University of Tennessee. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. - Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. 
in no event shall the copyright owner or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage. rocSOLVER-rocm-6.4.3/README.md000066400000000000000000000110451503202240500154510ustar00rootroot00000000000000# rocSOLVER rocSOLVER is a work-in-progress implementation of a subset of [LAPACK][1] functionality on the [ROCm platform][2]. ## Documentation > [!NOTE] > The published rocSOLVER documentation is available at [rocSOLVER](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the rocSOLVER/docs folder of this repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). ### How to build documentation Please follow the instructions below to build the documentation. ``` cd docs pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` ## Building rocSOLVER To download the rocSOLVER source code, clone this repository with the command: git clone https://github.com/ROCmSoftwarePlatform/rocSOLVER.git rocSOLVER requires rocBLAS as a companion GPU BLAS implementation. For more information about rocBLAS and how to install it, see the [rocBLAS documentation][4]. 
After a standard installation of rocBLAS, the following commands will build rocSOLVER and install to `/opt/rocm`: cd rocSOLVER ./install.sh -i Once installed, rocSOLVER can be used just like any other library with a C API. The header file will need to be included in the user code, and both the rocBLAS and rocSOLVER shared libraries will become link-time and run-time dependencies for the user application. If you are a developer contributing to rocSOLVER, you may wish to run `./scripts/install-hooks` to install the git hooks for autoformatting. You may also want to take a look at the [contributing guidelines][7] ## Using rocSOLVER The following code snippet shows how to compute the QR factorization of a general m-by-n real matrix in double precision using rocSOLVER. A longer version of this example is provided by `example_basic.cpp` in the [samples directory][5]. For a description of the `rocsolver_dgeqrf` function, see the [rocSOLVER API documentation][6]. ```cpp ///////////////////////////// // example.cpp source code // ///////////////////////////// #include // for std::min #include // for size_t #include #include // for hip functions #include // for all the rocsolver C interfaces and type declarations int main() { rocblas_int M; rocblas_int N; rocblas_int lda; // here is where you would initialize M, N and lda with desired values rocblas_handle handle; rocblas_create_handle(&handle); size_t size_A = size_t(lda) * N; // the size of the array for the matrix size_t size_piv = size_t(std::min(M, N)); // the size of array for the Householder scalars std::vector hA(size_A); // creates array for matrix in CPU std::vector hIpiv(size_piv); // creates array for householder scalars in CPU double *dA, *dIpiv; hipMalloc(&dA, sizeof(double)*size_A); // allocates memory for matrix in GPU hipMalloc(&dIpiv, sizeof(double)*size_piv); // allocates memory for scalars in GPU // here is where you would initialize matrix A (array hA) with input data // note: matrices must be stored in 
column major format, // i.e. entry (i,j) should be accessed by hA[i + j*lda] // copy data to GPU hipMemcpy(dA, hA.data(), sizeof(double)*size_A, hipMemcpyHostToDevice); // compute the QR factorization on the GPU rocsolver_dgeqrf(handle, M, N, dA, lda, dIpiv); // copy the results back to CPU hipMemcpy(hA.data(), dA, sizeof(double)*size_A, hipMemcpyDeviceToHost); hipMemcpy(hIpiv.data(), dIpiv, sizeof(double)*size_piv, hipMemcpyDeviceToHost); // the results are now in hA and hIpiv, so you can use them here hipFree(dA); // de-allocate GPU memory hipFree(dIpiv); rocblas_destroy_handle(handle); // destroy handle } ``` The exact command used to compile the example above may vary depending on the system environment, but here is a typical example: /opt/rocm/bin/hipcc -I/opt/rocm/include -c example.cpp /opt/rocm/bin/hipcc -o example -L/opt/rocm/lib -lrocsolver -lrocblas example.o [1]: https://www.netlib.org/lapack/ [2]: https://rocm.docs.amd.com/ [3]: https://rocm.docs.amd.com/projects/rocSOLVER/ [4]: https://rocm.docs.amd.com/projects/rocBLAS/ [5]: clients/samples/ [6]: https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/api/lapack.html#rocsolver-type-geqrf [7]: CONTRIBUTING.md rocSOLVER-rocm-6.4.3/bump_rocsolver_version.sh000077500000000000000000000010151503202240500213330ustar00rootroot00000000000000#!/bin/sh # run this script in develop after creating release-staging branch for feature-complete date # Edit script to bump versions for new development cycle/release. 
# for rocSOLVER version string OLD_ROCSOLVER_VERSION="3\.28\.0" NEW_ROCSOLVER_VERSION="3.29.0" sed -i "s/${OLD_ROCSOLVER_VERSION}/${NEW_ROCSOLVER_VERSION}/g" CMakeLists.txt # for rocSOLVER library name OLD_ROCSOLVER_SOVERSION="0\.4" NEW_ROCSOLVER_SOVERSION="0.5" sed -i "s/${OLD_ROCSOLVER_SOVERSION}/${NEW_ROCSOLVER_SOVERSION}/g" library/CMakeLists.txt rocSOLVER-rocm-6.4.3/clients/000077500000000000000000000000001503202240500156325ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/clients/CMakeLists.txt000077500000000000000000000161561503202240500204060ustar00rootroot00000000000000# ########################################################################## # Copyright (C) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
# ########################################################################## project(rocsolver-clients LANGUAGES C CXX) if(UNIX) enable_language(Fortran) endif() # Specify where to put the client binaries set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging") # The rocsolver target will exist if the library is being built along with the clients, # but if this is a clients-only build, we'll have to search for it. if(NOT TARGET rocsolver) find_package(rocsolver REQUIRED CONFIG PATHS ${ROCM_PATH}/rocsolver /opt/rocm/rocsolver) get_imported_target_location(location roc::rocsolver) message(STATUS "Found rocSOLVER: ${location}") endif() if(BUILD_CLIENTS_BENCHMARKS OR BUILD_CLIENTS_TESTS) if(ROCSOLVER_FIND_PACKAGE_LAPACK_CONFIG) find_package(LAPACK 3.7 REQUIRED CONFIG) else() find_package(LAPACK 3.7 REQUIRED) endif() if(NOT LAPACK_LIBRARIES) set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES} ) endif() add_library(clients-common INTERFACE) target_include_directories(clients-common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR} ) target_link_libraries(clients-common INTERFACE ${LAPACK_LIBRARIES} $<$:stdc++fs> fmt::fmt ) target_link_options(clients-common INTERFACE ${LAPACK_LINKER_FLAGS} ) set(rocauxiliary_inst_files common/auxiliary/testing_lacgv.cpp common/auxiliary/testing_laswp.cpp common/auxiliary/testing_larfg.cpp common/auxiliary/testing_larf.cpp common/auxiliary/testing_larft.cpp common/auxiliary/testing_larfb.cpp common/auxiliary/testing_lasr.cpp common/auxiliary/testing_latrd.cpp common/auxiliary/testing_labrd.cpp common/auxiliary/testing_lauum.cpp common/auxiliary/testing_bdsqr.cpp common/auxiliary/testing_bdsvdx.cpp common/auxiliary/testing_steqr.cpp common/auxiliary/testing_stedc.cpp common/auxiliary/testing_stedcj.cpp common/auxiliary/testing_stedcx.cpp common/auxiliary/testing_stein.cpp common/auxiliary/testing_lasyf.cpp common/auxiliary/testing_sterf.cpp common/auxiliary/testing_stebz.cpp common/auxiliary/testing_orgxr_ungxr.cpp 
common/auxiliary/testing_orgxl_ungxl.cpp common/auxiliary/testing_orglx_unglx.cpp common/auxiliary/testing_orgbr_ungbr.cpp common/auxiliary/testing_orgtr_ungtr.cpp common/auxiliary/testing_ormxr_unmxr.cpp common/auxiliary/testing_ormxl_unmxl.cpp common/auxiliary/testing_ormlx_unmlx.cpp common/auxiliary/testing_ormbr_unmbr.cpp common/auxiliary/testing_ormtr_unmtr.cpp ) set(roclapack_inst_files common/lapack/testing_potf2_potrf.cpp common/lapack/testing_potrs.cpp common/lapack/testing_posv.cpp common/lapack/testing_potri.cpp common/lapack/testing_getf2_getrf_npvt.cpp common/lapack/testing_getf2_getrf.cpp common/lapack/testing_geqr2_geqrf.cpp common/lapack/testing_gerq2_gerqf.cpp common/lapack/testing_geql2_geqlf.cpp common/lapack/testing_gelq2_gelqf.cpp common/lapack/testing_getrs.cpp common/lapack/testing_gesv.cpp common/lapack/testing_gesvd.cpp common/lapack/testing_gesdd.cpp common/lapack/testing_gesvdj.cpp common/lapack/testing_gesvdx.cpp common/lapack/testing_trtri.cpp common/lapack/testing_getri.cpp common/lapack/testing_getri_npvt.cpp common/lapack/testing_getri_outofplace.cpp common/lapack/testing_getri_npvt_outofplace.cpp common/lapack/testing_gels.cpp common/lapack/testing_gebd2_gebrd.cpp common/lapack/testing_sytf2_sytrf.cpp common/lapack/testing_sytxx_hetxx.cpp common/lapack/testing_sygsx_hegsx.cpp common/lapack/testing_syev_heev.cpp common/lapack/testing_syevd_heevd.cpp common/lapack/testing_syevdj_heevdj.cpp common/lapack/testing_syevdx_heevdx.cpp common/lapack/testing_syevj_heevj.cpp common/lapack/testing_syevx_heevx.cpp common/lapack/testing_sygv_hegv.cpp common/lapack/testing_sygvd_hegvd.cpp common/lapack/testing_sygvdj_hegvdj.cpp common/lapack/testing_sygvdx_hegvdx.cpp common/lapack/testing_sygvj_hegvj.cpp common/lapack/testing_sygvx_hegvx.cpp common/lapack/testing_geblttrf_npvt.cpp common/lapack/testing_geblttrf_npvt_interleaved.cpp common/lapack/testing_geblttrs_npvt.cpp common/lapack/testing_geblttrs_npvt_interleaved.cpp ) 
set(rocrefact_inst_files common/refact/testing_csrrf_analysis.cpp common/refact/testing_csrrf_sumlu.cpp common/refact/testing_csrrf_splitlu.cpp common/refact/testing_csrrf_refactlu.cpp common/refact/testing_csrrf_refactchol.cpp common/refact/testing_csrrf_solve.cpp ) set(common_source_files common/misc/lapack_host_reference.cpp common/misc/rocsolver_test.cpp common/misc/clients_utility.cpp common/misc/program_options.cpp common/misc/client_environment_helpers.cpp common/matrix_utils/matrix_utils.cpp ${rocauxiliary_inst_files} ${roclapack_inst_files} ${rocrefact_inst_files} ) prepend_path("${CMAKE_CURRENT_SOURCE_DIR}/" common_source_files common_source_paths) target_sources(clients-common INTERFACE ${common_source_paths}) # Copy and point to sparse test data file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/sparsedata/ DESTINATION ${PROJECT_BINARY_DIR}/staging/sparsedata/ ) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/sparsedata/ DESTINATION ${CMAKE_INSTALL_DATADIR}/rocsolver/test COMPONENT tests ) target_compile_definitions(clients-common INTERFACE -DROCBLAS_NO_DEPRECATED_WARNINGS ) if(BUILD_CLIENTS_BENCHMARKS) add_subdirectory(benchmarks) endif() if(BUILD_CLIENTS_TESTS) add_subdirectory(gtest) endif() endif() if(BUILD_CLIENTS_SAMPLES) add_subdirectory(samples) endif() if(BUILD_CLIENTS_EXTRA_TESTS) add_subdirectory(extras) endif() rocSOLVER-rocm-6.4.3/clients/benchmarks/000077500000000000000000000000001503202240500177475ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/clients/benchmarks/CMakeLists.txt000077500000000000000000000035411503202240500225150ustar00rootroot00000000000000# ########################################################################## # Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. 
Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # ########################################################################## add_executable(rocsolver-bench client.cpp) add_armor_flags(rocsolver-bench "${ARMOR_LEVEL}") target_link_libraries(rocsolver-bench PRIVATE Threads::Threads hip::device rocsolver-common clients-common roc::rocsolver ) target_compile_definitions(rocsolver-bench PRIVATE ROCM_USE_FLOAT16 ROCSOLVER_CLIENTS_BENCH ) rocm_install(TARGETS rocsolver-bench COMPONENT benchmarks) rocSOLVER-rocm-6.4.3/clients/benchmarks/client.cpp000066400000000000000000000675241503202240500217470ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include #include #include "common/misc/program_options.hpp" #include "common/misc/rocsolver_dispatcher.hpp" using namespace roc; // clang-format off const char* help_str = R"HELP_STR( rocSOLVER benchmark client help. Usage: ./rocsolver-bench In addition to some common general options, the following list of options corresponds to all the parameters that might be needed to test a given rocSOLVER function. The parameters are named as in the API user guide. The arrays are initialized internally by the program with random values. Note: When a required parameter/option is not provided, it will take the default value as listed below. 
If no default value is defined, the program will try to calculate a suitable value depending on the context of the problem and the tested function; if this is not possible, the program will abort with an error. Functions that accept multiple size parameters can generally be provided a single size parameter (typically, m) and a square-size matrix will be assumed. Example: ./rocsolver-bench -f getf2_batched -m 30 --lda 75 --batch_count 350 This will test getf2_batched with a set of 350 random 30x30 matrices. strideP will be set to be equal to 30. Options: )HELP_STR"; // clang-format on static std::string rocblas_version() { size_t size; rocblas_get_version_string_size(&size); std::string str(size - 1, '\0'); rocblas_get_version_string(str.data(), size); return str; } static std::string rocsolver_version() { size_t size; rocsolver_get_version_string_size(&size); std::string str(size - 1, '\0'); rocsolver_get_version_string(str.data(), size); return str; } static void print_version_info() { fmt::print("rocSOLVER version {} (with rocBLAS {})\n", rocsolver_version(), rocblas_version()); std::fflush(stdout); } int main(int argc, char* argv[]) try { Arguments argus; // disable unit_check in client benchmark, it is only // used in gtest unit test argus.unit_check = 0; // enable timing check,otherwise no performance data collected argus.timing = 1; std::string function; char precision = 's'; rocblas_int device_id = 0; // take arguments and set default values // clang-format off options_description desc("rocsolver client command line options"); desc.add_options()("help,h", "Produces this help message.") // test options ("batch_count", value(&argus.batch_count)->default_value(1), "Number of matrices or problem instances in the batch.\n" " Only applicable to batch routines.\n" " ") ("device", value(&device_id)->default_value(0), "Set the default device to be used for subsequent program runs.\n" " ") ("function,f", value(&function)->default_value("potf2"), "The LAPACK function 
to test.\n" " Options are: getf2, getrf, gesvd_batched, etc.\n" " ") ("iters,i", value(&argus.iters)->default_value(10), "Iterations to run inside the GPU timing loop.\n" " Reported time will be the average.\n" " ") ("alg_mode", value(&argus.alg_mode)->default_value(0), "0 = GPU-only, 1 = Hybrid\n" " This will change how the algorithm operates.\n" " Only applicable to functions with hybrid support.\n" " ") ("mem_query", value(&argus.mem_query)->default_value(0), "Calculate the required amount of device workspace memory? 0 = No, 1 = Yes.\n" " This forces the client to print only the amount of device memory required by\n" " the function, in bytes.\n" " ") ("perf", value(&argus.perf)->default_value(0), "Ignore CPU timing results? 0 = No, 1 = Yes.\n" " This forces the client to print only the GPU time and the error if requested.\n" " ") ("precision,r", value(&precision)->default_value('s'), "Precision to be used in the tests.\n" " Options are: s, d, c, z.\n" " ") ("profile", value(&argus.profile)->default_value(0), "Print profile logging results for the tested function.\n" " The argument specifies the max depth of the nested output.\n" " If the argument is unset or <= 0, profile logging is disabled.\n" " ") ("profile_kernels", value(&argus.profile_kernels)->default_value(0), "Include kernels in profile logging results? 0 = No, 1 = Yes.\n" " Used in conjunction with --profile to include kernels in the profile log.\n" " ") ("singular", value(&argus.singular)->default_value(0), "Test with degenerate matrices? 0 = No, 1 = Yes\n" " This will produce matrices that are singular, non positive-definite, etc.\n" " ") ("verify,v", value(&argus.norm_check)->default_value(0), "Validate GPU results with CPU? 0 = No, 1 = Yes.\n" " This will additionally print the relative error of the computations.\n" " ") ("hash", value(&argus.hash_check)->default_value(0), "Print hash of GPU results? 
0 = No, 1 = Yes.\n" " Meant for checking reproducibility of computations.\n" " ") // size options ("k", value(), "Matrix/vector size parameter.\n" " Represents a sub-dimension of a problem.\n" " For example, the number of Householder reflections in a transformation.\n" " ") ("m", value(), "Matrix/vector size parameter.\n" " Typically, the number of rows of a matrix.\n" " ") ("n", value(), "Matrix/vector size parameter.\n" " Typically, the number of columns of a matrix,\n" " or the order of a system or transformation.\n" " ") ("nrhs", value(), "Matrix/vector size parameter.\n" " Typically, the number of columns of a matrix on the right-hand side of a problem.\n" " ") // increment options ("inca", value()->default_value(1), "Matrix/vector increment parameter.\n" " Increment between values in matrices A.\n" " ") ("incb", value()->default_value(1), "Matrix/vector increment parameter.\n" " Increment between values in matrices B.\n" " ") ("incc", value()->default_value(1), "Matrix/vector increment parameter.\n" " Increment between values in matrices C.\n" " ") ("incx", value()->default_value(1), "Matrix/vector increment parameter.\n" " Increment between values in matrices/vectors X.\n" " ") // leading dimension options ("lda", value(), "Matrix size parameter.\n" " Leading dimension of matrices A.\n" " ") ("ldb", value(), "Matrix size parameter.\n" " Leading dimension of matrices B.\n" " ") ("ldc", value(), "Matrix size parameter.\n" " Leading dimension of matrices C.\n" " ") ("ldt", value(), "Matrix size parameter.\n" " Leading dimension of matrices T.\n" " ") ("ldu", value(), "Matrix size parameter.\n" " Leading dimension of matrices U.\n" " ") ("ldv", value(), "Matrix size parameter.\n" " Leading dimension of matrices V.\n" " ") ("ldw", value(), "Matrix size parameter.\n" " Leading dimension of matrices W.\n" " ") ("ldx", value(), "Matrix size parameter.\n" " Leading dimension of matrices X.\n" " ") ("ldy", value(), "Matrix size parameter.\n" " Leading dimension of 
matrices Y.\n" " ") ("ldz", value(), "Matrix size parameter.\n" " Leading dimension of matrices Z.\n" " ") // stride options ("strideA", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors A.\n" " ") ("strideB", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors B.\n" " ") ("strideD", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors D.\n" " ") ("strideE", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors E.\n" " ") ("strideF", value(), "Matrix/vector stride parameter.\n" " Stride for vectors ifail.\n" " ") ("strideQ", value(), "Matrix/vector stride parameter.\n" " Stride for vectors tauq.\n" " ") ("strideP", value(), "Matrix/vector stride parameter.\n" " Stride for vectors tau, taup, and ipiv.\n" " ") ("strideS", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors S.\n" " ") ("strideU", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors U.\n" " ") ("strideV", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors V.\n" " ") ("strideW", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors W.\n" " ") ("strideX", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors X.\n" " ") ("strideZ", value(), "Matrix/vector stride parameter.\n" " Stride for matrices/vectors Z.\n" " ") // refactorization options ("nnzM", value(), "The number of non-zero elements in sparse matrix M.\n" " Currently only a few test cases can be generated.\n" " The benchmark client will use the available case closest to the input value.\n" " ") ("nnzA", value(), "The number of non-zero elements in sparse matrix A.\n" " Currently only a few test cases can be generated.\n" " The benchmark client will use the available case closest to the input value.\n" " ") ("nnzL", value(), "The number of non-zero elements in sparse matrix L.\n" " Currently only a few test cases can be generated.\n" " The 
benchmark client will use the available case closest to the input value.\n" " ") ("nnzU", value(), "The number of non-zero elements in sparse matrix U.\n" " Currently only a few test cases can be generated.\n" " The benchmark client will use the available case closest to the input value.\n" " ") ("nnzT", value(), "The number of non-zero elements in sparse matrix T.\n" " Currently only a few test cases can be generated.\n" " The benchmark client will use the available case closest to the input value.\n" " ") ("rfinfo_mode", value(), "Specifies the desired re-factorization algorithm.\n" " 1 = LU, 2 = Cholesky.\n" " ") // bdsqr options ("nc", value()->default_value(0), "The number of columns of matrix C.\n" " Only applicable to bdsqr.\n" " ") ("nu", value(), "The number of columns of matrix U.\n" " Only applicable to bdsqr.\n" " ") ("nv", value()->default_value(0), "The number of columns of matrix V.\n" " Only applicable to bdsqr.\n" " ") // bdsvdx options ("svect", value()->default_value('N'), "N = none, S or V = the singular vectors are computed.\n" " Indicates how the left singular vectors are to be calculated and stored.\n" " Only applicable to bdsvdx.\n" " ") // laswp options ("k1", value(), "First index for row interchange.\n" " Only applicable to laswp.\n" " ") ("k2", value(), "Last index for row interchange.\n" " Only applicable to laswp.\n" " ") // gesvd options ("left_svect", value()->default_value('N'), "N = none, A = the entire orthogonal matrix is computed,\n" " S or V = the singular vectors are computed,\n" " O = the singular vectors overwrite the original matrix.\n" " Indicates how the left singular vectors are to be calculated and stored.\n" " ") ("right_svect", value()->default_value('N'), "N = none, A = the entire orthogonal matrix is computed,\n" " S or V = the singular vectors are computed,\n" " O = the singular vectors overwrite the original matrix.\n" " Indicates how the right singular vectors are to be calculated and stored.\n" " ") // stein 
options ("nev", value(), "Number of eigenvectors to compute in a partial decomposition.\n" " Only applicable to stein.\n" " ") // trtri options ("diag", value()->default_value('N'), "N = non-unit triangular, U = unit triangular.\n" " Indicates whether the diagonal elements of a triangular matrix are assumed to be one.\n" " Only applicable to trtri.\n" " ") // stebz options ("eorder", value()->default_value('E'), "E = entire matrix, B = by blocks.\n" " Indicates whether the computed eigenvalues are ordered by blocks or for the entire matrix.\n" " Only applicable to stebz.\n" " ") // geblttrf/geblttrs options ("nb", value(), "Number of rows and columns in each block.\n" " Only applicable to block tridiagonal matrix APIs.\n" " ") ("nblocks", value(), "Number of blocks along the diagonal.\n" " Only applicable to block tridiagonal matrix APIs.\n" " ") // partial eigenvalue/singular value decomposition options ("il", value(), "Lower index in ordered subset of eigenvalues.\n" " Used in partial eigenvalue decomposition functions.\n" " ") ("iu", value(), "Upper index in ordered subset of eigenvalues.\n" " Used in partial eigenvalue decomposition functions.\n" " ") ("erange", value()->default_value('A'), "A = all eigenvalues, V = in (vl, vu], I = from the il-th to the iu-th.\n" " For partial eigenvalue decompositions, it indicates the type of interval in which\n" " the eigenvalues will be found.\n" " ") ("srange", value()->default_value('A'), "A = all singular values, V = in (vl, vu], I = from the il-th to the iu-th.\n" " For partial singular value decompositions, it indicates the type of interval in which\n" " the singular values will be found.\n" " ") ("vl", value(), "Lower bound of half-open interval (vl, vu].\n" " Used in partial eigenvalue decomposition functions.\n" " Note: the used random input matrices have all eigenvalues in [-20, 20].\n" " ") ("vu", value(), "Upper bound of half-open interval (vl, vu].\n" " Used in partial eigenvalue decomposition functions.\n" " 
Note: the used random input matrices have all eigenvalues in [-20, 20].\n" " ") // iterative Jacobi options ("max_sweeps", value()->default_value(100), "Maximum number of sweeps/iterations.\n" " Used in iterative Jacobi functions.\n" " ") ("esort", value()->default_value('A'), "N = no sorting, A = ascending order.\n" " Indicates whether the computed eigenvalues are sorted in ascending order.\n" " Used in iterative Jacobi functions.\n" " ") // other options ("abstol", value()->default_value(0), "Absolute tolerance at which convergence is accepted.\n" " Used in iterative Jacobi and partial eigenvalue decomposition functions.\n" " ") ("direct", value()->default_value('F'), "F = forward, B = backward.\n" " The order in which a series of transformations are applied.\n" " ") ("pivot", value()->default_value('V'), "V = variable, T = top, B = bottom.\n" " Defines the planes on which a sequence of rotations is applied.\n" " ") ("evect", value()->default_value('N'), "N = none, V = compute eigenvectors of the matrix,\n" " I = compute eigenvectors of the tridiagonal matrix.\n" " Indicates how the eigenvectors are to be calculated and stored.\n" " ") ("fast_alg", value()->default_value('O'), "O = out-of-place, I = in-place.\n" " Enables out-of-place computations.\n" " ") ("itype", value()->default_value('1'), "1 = Ax, 2 = ABx, 3 = BAx.\n" " Problem type for generalized eigenproblems.\n" " ") ("side", value(), "L = left, R = right.\n" " The side from which a matrix should be multiplied.\n" " ") ("storev", value(), "C = column-wise, R = row-wise.\n" " Indicates whether data is stored column-wise or row-wise.\n" " ") ("trans", value()->default_value('N'), "N = no transpose, T = transpose, C = conjugate transpose.\n" " Indicates if a matrix should be transposed.\n" " ") ("uplo", value()->default_value('U'), "U = upper, L = lower.\n" " Indicates where the data for a triangular or symmetric/hermitian matrix is stored.\n" " "); // clang-format on variables_map vm; 
store(parse_command_line(argc, argv, desc), vm); notify(vm); // print help message if(vm.count("help")) { std::stringstream desc_ss{}; desc_ss << desc; fmt::print("{}{}\n", help_str, desc_ss.str()); return 0; } argus.populate(vm); if(!argus.perf) { print_version_info(); rocblas_int device_count = query_device_property(); if(device_count <= 0) throw std::runtime_error("No devices found"); if(device_count <= device_id) throw std::invalid_argument("Invalid Device ID"); } set_device(device_id); // catch invalid arguments argus.validate_precision("precision"); argus.validate_operation("trans"); argus.validate_side("side"); argus.validate_fill("uplo"); argus.validate_diag("diag"); argus.validate_direct("direct"); argus.validate_pivot("pivot"); argus.validate_storev("storev"); argus.validate_svect("svect"); argus.validate_svect("left_svect"); argus.validate_svect("right_svect"); argus.validate_erange("srange"); argus.validate_workmode("fast_alg"); argus.validate_evect("evect"); argus.validate_erange("erange"); argus.validate_eorder("eorder"); argus.validate_esort("esort"); argus.validate_itype("itype"); argus.validate_rfinfo_mode("rfinfo_mode"); // prepare logging infrastructure and ignore environment variables rocsolver_log_begin(); rocsolver_log_set_layer_mode(rocblas_layer_mode_none); // select and dispatch function test/benchmark rocsolver_dispatcher::invoke(function, precision, argus); // terminate logging rocsolver_log_end(); return 0; } catch(const std::exception& exp) { fmt::print(stderr, "{}\n", exp.what()); return -1; } rocSOLVER-rocm-6.4.3/clients/common/000077500000000000000000000000001503202240500171225ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/clients/common/auxiliary/000077500000000000000000000000001503202240500211315ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_bdsqr.cpp000066400000000000000000000032441503202240500245100ustar00rootroot00000000000000/* 
************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_bdsqr.hpp" #define TESTING_BDSQR(...) template void testing_bdsqr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_BDSQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_bdsqr.hpp000066400000000000000000000651231503202240500245210ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void bdsqr_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, S dD, S dE, T dV, const rocblas_int ldv, T dU, const rocblas_int ldu, T dC, const rocblas_int ldc, rocblas_int* dinfo) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_bdsqr(nullptr, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, rocblas_fill_full, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S) nullptr, dE, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, (S) nullptr, dV, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, (T) nullptr, ldv, dU, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, (T) nullptr, ldu, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, (T) nullptr, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers 
EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, 0, nv, nu, nc, (S) nullptr, (S) nullptr, (T) nullptr, ldv, (T) nullptr, ldu, (T) nullptr, ldc, dinfo), rocblas_status_success); } template void testing_bdsqr_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 2; rocblas_int nv = 2; rocblas_int nu = 2; rocblas_int nc = 2; rocblas_int ldv = 2; rocblas_int ldu = 2; rocblas_int ldc = 2; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments bdsqr_checkBadArgs(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dinfo.data()); } template void bdsqr_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, Sd& dD, Sd& dE, Td& dV, const rocblas_int ldv, Td& dU, const rocblas_int ldu, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hV, Th& hU, Th& hC, Uh& hInfo, std::vector& D, std::vector& E, const bool test) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, false); // Adding possible gaps to fully test the algorithm. for(rocblas_int i = 0; i < n - 1; ++i) { hE[0][i] -= 5; hD[0][i] -= 4; } hD[0][n - 1] -= 4; // (Forcing non-convergence expecting lapack and rocsolver to give // the same orthogonal equivalent matrix is not possible. 
Testing // implicitly the equivalent matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). // make copy of original data to test vectors if required if(test && (nv || nu || nc)) { for(rocblas_int i = 0; i < n - 1; ++i) { E[i] = hE[0][i]; D[i] = hD[0][i]; } D[n - 1] = hD[0][n - 1]; } // make V,U and C identities so that results are actually singular vectors // of B if(nv > 0) { memset(hV[0], 0, ldv * nv * sizeof(T)); for(rocblas_int i = 0; i < std::min(n, nv); ++i) hV[0][i + i * ldv] = T(1.0); } if(nu > 0) { memset(hU[0], 0, ldu * n * sizeof(T)); for(rocblas_int i = 0; i < std::min(n, nu); ++i) hU[0][i + i * ldu] = T(1.0); } if(nc > 0) { memset(hC[0], 0, ldc * nc * sizeof(T)); for(rocblas_int i = 0; i < std::min(n, nc); ++i) hC[0][i + i * ldc] = T(1.0); } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(nv > 0) CHECK_HIP_ERROR(dV.transfer_from(hV)); if(nu > 0) CHECK_HIP_ERROR(dU.transfer_from(hU)); if(nc > 0) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void bdsqr_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, Sd& dD, Sd& dE, Td& dV, const rocblas_int ldv, Td& dU, const rocblas_int ldu, Td& dC, const rocblas_int ldc, Ud& dInfo, const rocblas_int nvT, const rocblas_int nuT, const rocblas_int nvRes, const rocblas_int nuRes, Sh& hD, Sh& hDRes, Sh& hE, Sh& hERes, Th& hV, Th& hU, Th& hC, Uh& hInfo, Uh& hInfoRes, double* max_err, double* max_errv) { using S = decltype(std::real(T{})); std::vector hW(4 * n); std::vector D(n); std::vector E(n); // input data initialization bdsqr_initData(handle, uplo, n, nvRes, nuRes, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E, true); // execute computations: // complementary execution to compute all singular vectors if needed if(nvT > 0 || nuT > 0) { // send data 
to GPU bdsqr_initData(handle, uplo, n, nvT, nuT, 0, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E, false); CHECK_ROCBLAS_ERROR(rocsolver_bdsqr(handle, uplo, n, nvT, nuT, 0, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data())); } // send data to GPU bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E, false); // execute computations // CPU lapack cpu_bdsqr(uplo, n, 0, 0, 0, hD[0], hE[0], hV[0], ldv, hU[0], ldu, hC[0], ldc, hW.data(), hInfo[0]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(nvRes > 0) CHECK_HIP_ERROR(hV.transfer_from(dV)); if(nuRes > 0) CHECK_HIP_ERROR(hU.transfer_from(dU)); if(nc > 0) CHECK_HIP_ERROR(hC.transfer_from(dC)); // Check info for non-convergence *max_err = 0; EXPECT_EQ(hInfo[0][0], hInfoRes[0][0]); if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). // error is ||hD - hDRes|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) double err; T tmp; *max_errv = 0; err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); *max_err = err > *max_err ? 
err : *max_err; // Check the singular vectors if required if(hInfo[0][0] == 0 && (nv || nu)) { err = 0; rocblas_int n_comp = std::min(n, nvRes); if(uplo == rocblas_fill_upper) { // check singular vectors implicitly (A'*u_i = s_i*v_i) for(rocblas_int i = 0; i < n_comp; ++i) { for(rocblas_int j = 0; j < n; ++j) { if(i > 0) tmp = D[i] * hU[0][i + j * ldu] + E[i - 1] * hU[0][(i - 1) + j * ldu] - hDRes[0][j] * hV[0][j + i * ldv]; else tmp = D[i] * hU[0][i + j * ldu] - hDRes[0][j] * hV[0][j + i * ldv]; err += std::abs(tmp) * std::abs(tmp); } } } else { // check singular vectors implicitly (A*v_i = s_i*u_i) for(rocblas_int i = 0; i < n_comp; ++i) { for(rocblas_int j = 0; j < n; ++j) { if(i > 0) tmp = D[i] * hV[0][j + i * ldv] + E[i - 1] * hV[0][j + (i - 1) * ldv] - hDRes[0][j] * hU[0][i + j * ldu]; else tmp = D[i] * hV[0][j + i * ldv] - hDRes[0][j] * hU[0][i + j * ldu]; err += std::abs(tmp) * std::abs(tmp); } } } double normD = double(snorm('F', 1, n, D.data(), 1)); double normE = double(snorm('F', 1, n - 1, E.data(), 1)); err = std::sqrt(err) / std::sqrt(normD * normD + normE * normE); *max_errv = err > *max_errv ? err : *max_errv; } // C should be the transpose of U if(hInfo[0][0] == 0 && nc) { err = 0; for(rocblas_int i = 0; i < nc; ++i) { for(rocblas_int j = 0; j < n; ++j) { tmp = hC[0][j + i * ldc] - hU[0][i + j * ldu]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err); *max_errv = err > *max_errv ? 
err : *max_errv; } } template void bdsqr_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nv, const rocblas_int nu, const rocblas_int nc, Sd& dD, Sd& dE, Td& dV, const rocblas_int ldv, Td& dU, const rocblas_int ldu, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hV, Th& hU, Th& hC, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { using S = decltype(std::real(T{})); std::vector hW(4 * n); std::vector D; std::vector E; if(!perf) { bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E, false); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_bdsqr(uplo, n, nv, nu, nc, hD[0], hE[0], hV[0], ldv, hU[0], ldu, hC[0], ldc, hW.data(), hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E, false); // cold calls for(int iter = 0; iter < 2; iter++) { bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E, false); CHECK_ROCBLAS_ERROR(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { bdsqr_initData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, D, E, false); start = 
get_time_us_sync(stream); rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_bdsqr(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int nv = argus.get("nv", 0); rocblas_int nu = argus.get("nu", 0); rocblas_int nc = argus.get("nc", 0); rocblas_int ldv = argus.get("ldv", nv > 0 ? n : 1); rocblas_int ldu = argus.get("ldu", nu > 0 ? nu : 1); rocblas_int ldc = argus.get("ldc", nc > 0 ? n : 1); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; if(argus.alg_mode == 1) { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid), rocblas_status_success); rocsolver_alg_mode alg_mode; EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode), rocblas_status_success); EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); } // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldv, (T*)nullptr, ldu, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // sizes for testing singular vectors // TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY (NOT EXPLICITLY COMPARING // WITH LAPACK), SO WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF // THE RIGHT AND LEFT VECTORS rocblas_int nvA = nv, nuA = nu; rocblas_int nvT = 0, nuT = 0; rocblas_int nvRes = nv, nuRes = nu; rocblas_int ldvRes = ldv, lduRes = ldu; if(nv && nu) { nvRes = nvA = std::max(nv, nu); nuRes = nuA = std::max(nvRes, nc); lduRes = std::max(lduRes, nuRes); } else if(nu) { 
nvRes = nvT = nu; nuRes = nuA = std::max(nu, nc); ldvRes = n; lduRes = std::max(lduRes, nuRes); } else if(nv || nc) { nuRes = nuT = std::max(nv, nc); lduRes = nuRes; } // E, V, U, and C could have size zero in cases that are not quick-return or // invalid cases setting the size to one to avoid possible memory-access // errors in the rest of the unit test size_t size_D = size_t(n); size_t size_E = n > 1 ? size_t(n - 1) : 1; size_t size_V = std::max(size_t(ldv) * nv, size_t(1)); size_t size_U = std::max(size_t(ldu) * n, size_t(1)); size_t size_C = std::max(size_t(ldc) * nc, size_t(1)); size_t size_VRes = std::max(size_t(ldvRes) * nvRes, size_t(1)); size_t size_URes = std::max(size_t(lduRes) * n, size_t(1)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || nv < 0 || nu < 0 || nc < 0 || ldu < nu || ldv < 1 || ldc < 1) || (nv > 0 && ldv < n) || (nc > 0 && ldc < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldv, (T*)nullptr, ldu, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldv, (T*)nullptr, ldu, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); 
host_strided_batch_vector hInfo(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check computations if(argus.unit_check || argus.norm_check) { host_strided_batch_vector hDRes(size_D, 1, size_D, 1); host_strided_batch_vector hERes(size_E, 1, size_E, 1); host_strided_batch_vector hV(size_VRes, 1, size_VRes, 1); host_strided_batch_vector hU(size_URes, 1, size_URes, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dV(size_VRes, 1, size_VRes, 1); device_strided_batch_vector dU(size_URes, 1, size_URes, 1); if(size_VRes) CHECK_HIP_ERROR(dV.memcheck()); if(size_URes) CHECK_HIP_ERROR(dU.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsqr(handle, uplo, n, nv, nu, nc, dD.data(), dE.data(), dV.data(), ldv, dU.data(), ldu, dC.data(), ldc, dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } bdsqr_getError(handle, uplo, n, nvA, nuA, nc, dD, dE, dV, ldvRes, dU, lduRes, dC, ldc, dInfo, nvT, nuT, nvRes, nuRes, hD, hDRes, hE, hERes, hV, hU, hC, hInfo, hInfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { host_strided_batch_vector hV(size_V, 1, size_V, 1); host_strided_batch_vector hU(size_U, 1, size_U, 1); device_strided_batch_vector dV(size_V, 1, size_V, 1); device_strided_batch_vector dU(size_U, 1, size_U, 1); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); bdsqr_getPerfData(handle, uplo, n, nv, nu, nc, dD, dE, dV, ldv, dU, ldu, dC, ldc, dInfo, hD, hE, hV, hU, hC, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, 
argus.perf); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); if(nv || nu || nc) ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * n); } // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { if(nv || nu || nc) max_error = (max_error >= max_errorv) ? max_error : max_errorv; rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "nv", "nu", "nc", "ldv", "ldu", "ldc"); rocsolver_bench_output(uploC, n, nv, nu, nc, ldv, ldu, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_BDSQR(...) extern template void testing_bdsqr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_BDSQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_bdsvdx.cpp000066400000000000000000000032461503202240500246710ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_bdsvdx.hpp" #define TESTING_BDSVDX(...) template void testing_bdsvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_BDSVDX, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_bdsvdx.hpp000066400000000000000000000617211503202240500247000ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void bdsvdx_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_svect svect, const rocblas_srange srange, const rocblas_int n, U dD, U dE, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, rocblas_int* dNsv, U dS, U dZ, const rocblas_int ldz, rocblas_int* dIfail, rocblas_int* dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(nullptr, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, rocblas_fill_full, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, 
dIfail, dInfo), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, rocblas_svect_all, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, rocblas_srange(0), n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (U) nullptr, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, (U) nullptr, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, (rocblas_int*)nullptr, dS, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, (U) nullptr, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, (U) nullptr, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, (rocblas_int*)nullptr, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, 0, (U) nullptr, (U) nullptr, vl, vu, il, iu, dNsv, (U) nullptr, (U) nullptr, ldz, (rocblas_int*)nullptr, dInfo), rocblas_status_success); } template void testing_bdsvdx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 2; rocblas_fill uplo 
= rocblas_fill_upper; rocblas_svect svect = rocblas_svect_singular; rocblas_srange srange = rocblas_srange_all; rocblas_int ldz = 4; T vl = 0; T vu = 0; rocblas_int il = 0; rocblas_int iu = 0; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dNsv(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments bdsvdx_checkBadArgs(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); } template void bdsvdx_initData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Th& hD, Th& hE) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add fixed splits in the matrix to test split handling // (scaling ensures that all singular values are in [0, 20]) for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 10; hE[0][i] = (hE[0][i] - 5) / 10; if(i == n / 4 || i == n / 2 || i == n - 1) hE[0][i] = 0; if(i == n / 7 || i == n / 5 || i == n / 3) hD[0][i] *= -1; } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); } } template void bdsvdx_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_svect svect, const rocblas_srange srange, const rocblas_int n, Td& dD, Td& dE, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, Ud& dNsv, Td& dS, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Th& hD, Th& hE, Uh& hNsv, Uh& hNsvRes, Th& hS, Th& hSRes, Th& hZ, Th& hZRes, Uh& hIfailRes, Uh& hInfo, 
Uh& hInfoRes, double* max_err) { std::vector work(14 * n); std::vector iwork(12 * n); // input data initialization bdsvdx_initData(handle, n, dD, dE, hD, hE); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); CHECK_HIP_ERROR(hNsvRes.transfer_from(dNsv)); CHECK_HIP_ERROR(hSRes.transfer_from(dS)); CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack // WORKAROUND: For some test cases, LAPACK's bdsvdx is returning incorrect singular values // when srange is rocblas_srange_index. In this case, we use rocblas_srange_all to get // all the singular values and offset and use il as an offset into the result array. rocblas_int ioffset = 0; if(srange == rocblas_srange_index) { cpu_bdsvdx(uplo, rocblas_svect_none, rocblas_srange_all, n, hD[0], hE[0], vl, vu, il, iu, hNsv[0], hS[0], hZ[0], ldz, work.data(), iwork.data(), hInfo[0]); ioffset = il - 1; hNsv[0][0] = iu - il + 1; } else { cpu_bdsvdx(uplo, rocblas_svect_none, srange, n, hD[0], hE[0], vl, vu, il, iu, hNsv[0], hS[0], hZ[0], ldz, work.data(), iwork.data(), hInfo[0]); } // check info EXPECT_EQ(hInfo[0][0], hInfoRes[0][0]); if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; // if finding singular values succeeded, check values double err; if(hInfoRes[0][0] == 0) { // check number of computed singular values rocblas_int nn = hNsvRes[0][0]; *max_err += std::abs(nn - hNsv[0][0]); EXPECT_EQ(hNsv[0][0], hNsvRes[0][0]); // error is ||hS - hSRes|| / ||hS|| // using frobenius norm err = norm_error('F', 1, nn, 1, hS[0] + ioffset, hSRes[0]); *max_err = err > *max_err ? 
err : *max_err; // Check the singular vectors if required // U is stored in hZRes, and V is stored in hZRes+n if(svect != rocblas_svect_none) { // U and V should be orthonormal, if they are then U^T*U and V^T*V should be the identity if(nn > 0) { std::vector UUres(nn * nn, 0.0); std::vector VVres(nn * nn, 0.0); std::vector I(nn * nn, 0.0); for(rocblas_int i = 0; i < nn; i++) I[i + i * nn] = T(1); cpu_gemm(rocblas_operation_conjugate_transpose, rocblas_operation_none, nn, nn, n, T(1), hZRes[0], ldz, hZRes[0], ldz, T(0), UUres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), UUres.data()); *max_err = err > *max_err ? err : *max_err; cpu_gemm(rocblas_operation_conjugate_transpose, rocblas_operation_none, nn, nn, n, T(1), hZRes[0] + n, ldz, hZRes[0] + n, ldz, T(0), VVres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), VVres.data()); *max_err = err > *max_err ? err : *max_err; } err = 0; // form bidiagonal matrix B std::vector B(n * n); for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) B[i + j * n] = hD[0][i]; else if(i + 1 == j && uplo == rocblas_fill_upper) B[i + j * n] = hE[0][i]; else if(i == j + 1 && uplo == rocblas_fill_lower) B[i + j * n] = hE[0][j]; else B[i + j * n] = 0; } } // check singular vectors implicitly (B*v_k = s_k*u_k) for(rocblas_int k = 0; k < nn; ++k) { cpu_gemv(rocblas_operation_none, n, n, T(1), B.data(), n, hZRes[0] + n + k * ldz, 1, -hSRes[0][k], hZRes[0] + k * ldz, 1); } err = double(snorm('F', n, nn, hZRes[0], ldz)) / double(snorm('F', n, n, B.data(), n)); *max_err = err > *max_err ? err : *max_err; // check ifail err = 0; for(int j = 0; j < nn; j++) { EXPECT_EQ(hIfailRes[0][j], 0) << "where j = " << j; if(hIfailRes[0][j] != 0) err++; } *max_err = err > *max_err ? 
err : *max_err; } } else { if(svect != rocblas_svect_none) { // check ifail err = 0; for(int j = 0; j < hInfoRes[0][0]; j++) { EXPECT_NE(hIfailRes[0][j], 0) << "where j = " << j; if(hIfailRes[0][j] == 0) err++; } *max_err = err > *max_err ? err : *max_err; } } } template void bdsvdx_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_svect svect, const rocblas_srange srange, const rocblas_int n, Td& dD, Td& dE, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, Ud& dNsv, Td& dS, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Th& hD, Th& hE, Uh& hNsv, Th& hS, Th& hZ, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { std::vector work(14 * n); std::vector iwork(12 * n); bdsvdx_initData(handle, n, dD, dE, hD, hE); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_bdsvdx(uplo, svect, srange, n, hD[0], hE[0], vl, vu, il, iu, hNsv[0], hS[0], hZ[0], ldz, work.data(), iwork.data(), hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } bdsvdx_initData(handle, n, dD, dE, hD, hE); // cold calls for(int iter = 0; iter < 2; iter++) { bdsvdx_initData(handle, n, dD, dE, hD, hE); CHECK_ROCBLAS_ERROR(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { bdsvdx_initData(handle, n, dD, dE, hD, hE); start = get_time_us_sync(stream); 
rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_bdsvdx(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); char svectC = argus.get("svect"); char srangeC = argus.get("srange"); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_svect svect = char2rocblas_svect(svectC); rocblas_srange srange = char2rocblas_srange(srangeC); T vl = T(argus.get("vl", 0)); T vu = T(argus.get("vu", srangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", srangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", srangeC == 'I' ? 1 : 0); rocblas_int n = argus.get("n"); rocblas_int ldz = argus.get("ldz", 2 * n); rocblas_int nsv_max = (srange == rocblas_srange_index ? iu - il + 1 : n); rocblas_int hot_calls = argus.iters; // check non-supported values if((uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) || (svect != rocblas_svect_none && svect != rocblas_svect_singular)) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (T*)nullptr, (T*)nullptr, vl, vu, il, iu, (rocblas_int*)nullptr, (T*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_D = n; size_t size_E = n; size_t size_S = nsv_max; size_t size_S_cpu = n; size_t size_Z = ldz * nsv_max; size_t size_Ifail = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_SRes = (argus.unit_check || argus.norm_check) ? size_S : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check) ? size_Z : 0; size_t size_IfailRes = (argus.unit_check || argus.norm_check) ? 
size_Ifail : 0; // check invalid sizes bool invalid_size = (n < 0) || (svect == rocblas_svect_none && ldz < 1) || (svect != rocblas_svect_none && ldz < 2 * n) || (srange == rocblas_srange_value && (vl < 0 || vl >= vu)) || (srange == rocblas_srange_index && ((iu > n) || (n > 0 && il > iu))) || (srange == rocblas_srange_index && (il < 1 || iu < 0)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (T*)nullptr, (T*)nullptr, vl, vu, il, iu, (rocblas_int*)nullptr, (T*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_bdsvdx(handle, uplo, svect, srange, n, (T*)nullptr, (T*)nullptr, vl, vu, il, iu, (rocblas_int*)nullptr, (T*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations // host host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hS(size_S_cpu, 1, size_S_cpu, 1); // extra space for cpu_bdsvdx host_strided_batch_vector hSRes(size_SRes, 1, size_SRes, 1); host_strided_batch_vector hZ(size_Z, 1, size_Z, 1); host_strided_batch_vector hZRes(size_ZRes, 1, size_ZRes, 1); host_strided_batch_vector hNsv(1, 1, 1, 1); host_strided_batch_vector hNsvRes(1, 1, 1, 1); host_strided_batch_vector hIfailRes(size_IfailRes, 1, size_IfailRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); // device device_strided_batch_vector dD(size_D, 1, 
size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dS(size_S, 1, size_S, 1); device_strided_batch_vector dZ(size_Z, 1, size_Z, 1); device_strided_batch_vector dNsv(1, 1, 1, 1); device_strided_batch_vector dIfail(size_Ifail, 1, size_Ifail, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); if(size_Ifail) CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_bdsvdx(handle, uplo, svect, srange, n, dD.data(), dE.data(), vl, vu, il, iu, dNsv.data(), dS.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) bdsvdx_getError(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo, hD, hE, hNsv, hNsvRes, hS, hSRes, hZ, hZRes, hIfailRes, hInfo, hInfoRes, &max_error); // collect performance data if(argus.timing) bdsvdx_getPerfData(handle, uplo, svect, srange, n, dD, dE, vl, vu, il, iu, dNsv, dS, dZ, ldz, dIfail, dInfo, hD, hE, hNsv, hS, hZ, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "svect", "srange", "n", "vl", "vu", "il", "iu", "ldz"); rocsolver_bench_output(uploC, svectC, srangeC, n, vl, vu, il, iu, ldz); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_BDSVDX(...) extern template void testing_bdsvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_BDSVDX, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_labrd.cpp000066400000000000000000000032441503202240500244610ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_labrd.hpp" #define TESTING_LABRD(...) template void testing_labrd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LABRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_labrd.hpp000066400000000000000000000453271503202240500244760ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void labrd_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, T dA, const rocblas_int lda, S dD, S dE, U dTauq, U dTaup, T dX, const rocblas_int ldx, T dY, const rocblas_int ldy) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_labrd(nullptr, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, (T) nullptr, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, (S) nullptr, dE, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, (S) nullptr, dTauq, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, (U) nullptr, dTaup, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, dTauq, (U) 
nullptr, dX, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, (T) nullptr, ldx, dY, ldy), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_labrd(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, (T) nullptr, ldy), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, 0, n, 0, (T) nullptr, lda, dD, dE, dTauq, dTaup, (T) nullptr, ldx, dY, ldy), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, 0, 0, (T) nullptr, lda, dD, dE, dTauq, dTaup, dX, ldx, (T) nullptr, ldy), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, 0, dA, lda, (S) nullptr, (S) nullptr, (U) nullptr, (U) nullptr, (T) nullptr, ldx, (T) nullptr, ldy), rocblas_status_success); } template void testing_labrd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int nb = 1; rocblas_int lda = 1; rocblas_int ldx = 1; rocblas_int ldy = 1; // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTauq(1, 1, 1, 1); device_strided_batch_vector dTaup(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); device_strided_batch_vector dY(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTauq.memcheck()); CHECK_HIP_ERROR(dTaup.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dY.memcheck()); // check bad arguments labrd_checkBadArgs(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy); } template void labrd_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, Td& dA, const rocblas_int lda, 
Sd& dD, Sd& dE, Ud& dTauq, Ud& dTaup, Td& dX, const rocblas_int ldx, Td& dY, const rocblas_int ldy, Th& hA, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, Th& hX, Th& hY) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || (m >= n && j == i + 1) || (m < n && i == j + 1)) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void labrd_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, Td& dA, const rocblas_int lda, Sd& dD, Sd& dE, Ud& dTauq, Ud& dTaup, Td& dX, const rocblas_int ldx, Td& dY, const rocblas_int ldy, Th& hA, Th& hARes, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, Th& hX, Th& hXRes, Th& hY, Th& hYRes, double* max_err) { // input data initialization labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hXRes.transfer_from(dX)); CHECK_HIP_ERROR(hYRes.transfer_from(dY)); // CPU lapack cpu_labrd(m, n, nb, hA[0], lda, hD[0], hE[0], hTauq[0], hTaup[0], hX[0], ldx, hY[0], ldy); // error is max(||hA - hARes|| / ||hA||, ||hX - hXRes|| / ||hX||, ||hY - // hYRes|| / ||hY||) (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY // ISSUES. IT MIGHT BE REVISITED IN THE FUTURE) using frobenius norm double err; *max_err = 0; err = norm_error('F', m, n, lda, hA[0], hARes[0]); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', m - nb, nb, ldx, hX[0] + nb, hXRes[0] + nb); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', n - nb, nb, ldy, hY[0] + nb, hYRes[0] + nb); *max_err = err > *max_err ? 
err : *max_err; } template void labrd_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int nb, Td& dA, const rocblas_int lda, Sd& dD, Sd& dE, Ud& dTauq, Ud& dTaup, Td& dX, const rocblas_int ldx, Td& dY, const rocblas_int ldy, Th& hA, Sh& hD, Sh& hE, Uh& hTauq, Uh& hTaup, Th& hX, Th& hY, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); // cpu-lapack performance *cpu_time_used = get_time_us_no_sync(); memset(hX[0], 0, ldx * nb * sizeof(T)); memset(hY[0], 0, ldy * nb * sizeof(T)); cpu_labrd(m, n, nb, hA[0], lda, hD[0], hE[0], hTauq[0], hTaup[0], hX[0], ldx, hY[0], ldy); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); // cold calls for(int iter = 0; iter < 2; iter++) { labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); CHECK_ROCBLAS_ERROR(rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { labrd_initData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY); start = get_time_us_sync(stream); rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), 
dTaup.data(), dX.data(), ldx, dY.data(), ldy); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_labrd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nb = argus.get("k", std::min(m, n)); rocblas_int lda = argus.get("lda", m); rocblas_int ldx = argus.get("ldx", m); rocblas_int ldy = argus.get("ldy", n); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = lda * n; size_t size_D = nb; size_t size_E = nb; size_t size_Q = nb; size_t size_P = nb; size_t size_X = ldx * nb; size_t size_Y = ldy * nb; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_XRes = (argus.unit_check || argus.norm_check) ? size_X : 0; size_t size_YRes = (argus.unit_check || argus.norm_check) ? 
size_Y : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || nb < 0 || nb > std::min(m, n) || lda < m || ldx < m || ldy < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, nb, (T*)nullptr, lda, (S*)nullptr, (S*)nullptr, (T*)nullptr, (T*)nullptr, (T*)nullptr, ldx, (T*)nullptr, ldy), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_labrd(handle, m, n, nb, (T*)nullptr, lda, (S*)nullptr, (S*)nullptr, (T*)nullptr, (T*)nullptr, (T*)nullptr, ldx, (T*)nullptr, ldy)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hARes(size_ARes, 1, size_ARes, 1); host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hTauq(size_Q, 1, size_Q, 1); host_strided_batch_vector hTaup(size_P, 1, size_P, 1); host_strided_batch_vector hX(size_X, 1, size_X, 1); host_strided_batch_vector hXRes(size_XRes, 1, size_XRes, 1); host_strided_batch_vector hY(size_Y, 1, size_Y, 1); host_strided_batch_vector hYRes(size_YRes, 1, size_YRes, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dTauq(size_Q, 1, size_Q, 1); device_strided_batch_vector dTaup(size_P, 1, size_P, 1); device_strided_batch_vector dX(size_X, 1, size_X, 1); device_strided_batch_vector dY(size_Y, 1, size_Y, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_D) 
CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_Q) CHECK_HIP_ERROR(dTauq.memcheck()); if(size_P) CHECK_HIP_ERROR(dTaup.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(size_Y) CHECK_HIP_ERROR(dY.memcheck()); // check quick return if(m == 0 || n == 0 || nb == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_labrd(handle, m, n, nb, dA.data(), lda, dD.data(), dE.data(), dTauq.data(), dTaup.data(), dX.data(), ldx, dY.data(), ldy), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) labrd_getError(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hARes, hD, hE, hTauq, hTaup, hX, hXRes, hY, hYRes, &max_error); // collect performance data if(argus.timing) labrd_getPerfData(handle, m, n, nb, dA, lda, dD, dE, dTauq, dTaup, dX, ldx, dY, ldy, hA, hD, hE, hTauq, hTaup, hX, hY, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using nb * max(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb * std::max(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "nb", "lda", "ldx", "ldy"); rocsolver_bench_output(m, n, nb, lda, ldx, ldy); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LABRD(...) 
extern template void testing_labrd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LABRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lacgv.cpp000066400000000000000000000032671503202240500244760ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_lacgv.hpp" #define TESTING_LACGV(...) 
template void testing_lacgv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LACGV, FOREACH_COMPLEX_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lacgv.hpp000066400000000000000000000223241503202240500244760ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void lacgv_checkBadArgs(const rocblas_handle handle, const I n, T dA, const I inc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(nullptr, n, dA, inc), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, n, (T) nullptr, inc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, (I)0, (T) nullptr, inc), rocblas_status_success); } template void testing_lacgv_bad_arg() { // safe arguments rocblas_local_handle handle; I n = 1; I inc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments lacgv_checkBadArgs(handle, n, dA.data(), inc); } template void lacgv_initData(const rocblas_handle handle, const I n, Td& dA, const I inc, Th& hA) { if(CPU) { rocblas_init(hA, true); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void lacgv_getError(const rocblas_handle handle, const I n, Td& dA, const I inc, Th& hA, Th& hAr, double* max_err) { // initialize data lacgv_initData(handle, n, dA, inc, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_lacgv(handle, n, dA.data(), inc)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_lacgv(n, hA[0], inc); // error |hA - hAr| (elements must be identical) *max_err = 0; double diff; for(int j = 0; j < n; j++) { diff = std::abs(hAr[0][j * abs(inc)] - hA[0][j * abs(inc)]); *max_err = diff > *max_err ? 
diff : *max_err; } } template void lacgv_getPerfData(const rocblas_handle handle, const I n, Td& dA, const I inc, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { lacgv_initData(handle, n, dA, inc, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_lacgv(n, hA[0], inc); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } lacgv_initData(handle, n, dA, inc, hA); // cold calls for(int iter = 0; iter < 2; iter++) { lacgv_initData(handle, n, dA, inc, hA); CHECK_ROCBLAS_ERROR(rocsolver_lacgv(handle, n, dA.data(), inc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { lacgv_initData(handle, n, dA, inc, hA); start = get_time_us_sync(stream); rocsolver_lacgv(handle, n, dA.data(), inc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_lacgv(Arguments& argus) { // get arguments rocblas_local_handle handle; I n = argus.get("n"); I inc = argus.get("incx"); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(n) * abs(inc); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || !inc); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, n, (T*)nullptr, inc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_lacgv(handle, n, (T*)nullptr, inc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_lacgv(handle, n, dA.data(), inc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) lacgv_getError(handle, n, dA, inc, hA, hAr, &max_error); // collect performance data if(argus.timing) lacgv_getPerfData(handle, n, dA, inc, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // no tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 0); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "inc"); rocsolver_bench_output(n, inc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); 
rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LACGV(...) extern template void testing_lacgv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LACGV, FOREACH_COMPLEX_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larf.cpp000066400000000000000000000032621503202240500243210ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_larf.hpp" #define TESTING_LARF(...) template void testing_larf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARF, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larf.hpp000066400000000000000000000332641503202240500243330ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void larf_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const I m, const I n, T dx, const I inc, T dt, T dA, const I lda) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_larf(nullptr, side, m, n, dx, inc, dt, dA, lda), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, rocblas_side_both, m, n, dx, inc, dt, dA, lda), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, side, m, n, (T) nullptr, inc, dt, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, side, m, n, dx, inc, (T) nullptr, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, side, m, n, dx, inc, dt, (T) nullptr, lda), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, rocblas_side_left, (I)0, n, (T) nullptr, inc, (T) nullptr, (T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_larf(handle, rocblas_side_right, m, (I)0, (T) nullptr, inc, (T) nullptr, (T) nullptr, lda), 
rocblas_status_success); } template void testing_larf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; I m = 1; I n = 1; I inc = 1; I lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dx(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check bad arguments larf_checkBadArgs(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda); } template void larf_initData(const rocblas_handle handle, const rocblas_side side, const I m, const I n, Td& dx, const I inc, Td& dt, Td& dA, const I lda, Th& xx, Th& hx, Th& ht, Th& hA) { if(CPU) { I order = xx.n(); rocblas_init(hA, true); rocblas_init(xx, true); // compute householder reflector cpu_larfg(order, xx[0], xx[0] + abs(inc), abs(inc), ht[0]); xx[0][0] = 1; for(I i = 0; i < order; i++) { if(inc < 0) hx[0][i * abs(inc)] = xx[0][(order - 1 - i) * abs(inc)]; else hx[0][i * inc] = xx[0][i * inc]; } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dx.transfer_from(hx)); CHECK_HIP_ERROR(dt.transfer_from(ht)); } } template void larf_getError(const rocblas_handle handle, const rocblas_side side, const I m, const I n, Td& dx, const I inc, Td& dt, Td& dA, const I lda, Th& xx, Th& hx, Th& ht, Th& hA, Th& hAr, double* max_err) { size_t size_w = (side == rocblas_side_left) ? size_t(n) : size_t(m); std::vector hw(size_w); // initialize data larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_larf(side, m, n, hx[0], inc, ht[0], hA[0], lda, hw.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void larf_getPerfData(const rocblas_handle handle, const rocblas_side side, const I m, const I n, Td& dx, const I inc, Td& dt, Td& dA, const I lda, Th& xx, Th& hx, Th& ht, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_w = (side == rocblas_side_left) ? size_t(n) : size_t(m); std::vector hw(size_w); if(!perf) { larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larf(side, m, n, hx[0], inc, ht[0], hA[0], lda, hw.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); // cold calls for(int iter = 0; iter < 2; iter++) { larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); CHECK_ROCBLAS_ERROR( rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larf_initData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA); start = get_time_us_sync(stream); rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larf(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); I m = argus.get("m"); I n = argus.get("n", m); I inc = 
argus.get("incx"); I lda = argus.get("lda", m); rocblas_side side = char2rocblas_side(sideC); rocblas_int hot_calls = argus.iters; // check non-supported values if(side != rocblas_side_left && side != rocblas_side_right) { EXPECT_ROCBLAS_STATUS( rocsolver_larf(handle, side, m, n, (T*)nullptr, inc, (T*)nullptr, (T*)nullptr, lda), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_A = size_t(lda) * n; size_t size_x = left ? size_t(m) : size_t(n); size_t stx = size_x * abs(inc); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || !inc || lda < m); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_larf(handle, side, m, n, (T*)nullptr, inc, (T*)nullptr, (T*)nullptr, lda), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY( rocsolver_larf(handle, side, m, n, (T*)nullptr, inc, (T*)nullptr, (T*)nullptr, lda)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hx(size_x, abs(inc), stx, 1); host_strided_batch_vector xx(size_x, abs(inc), stx, 1); host_strided_batch_vector ht(1, 1, 1, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dx(size_x, abs(inc), stx, 1); device_strided_batch_vector dt(1, 1, 1, 1); if(size_A) 
CHECK_HIP_ERROR(dA.memcheck()); if(size_x) CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_larf(handle, side, m, n, dx.data(), inc, dt.data(), dA.data(), lda), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larf_getError(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA, hAr, &max_error); // collect performance data if(argus.timing) larf_getPerfData(handle, side, m, n, dx, inc, dt, dA, lda, xx, hx, ht, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using size_x * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, size_x); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "m", "n", "inc", "lda"); rocsolver_bench_output(sideC, m, n, inc, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARF(...) 
extern template void testing_larf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARF, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larfb.cpp000066400000000000000000000032441503202240500244630ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_larfb.hpp" #define TESTING_LARFB(...) 
template void testing_larfb<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARFB, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larfb.hpp000066400000000000000000000510731503202240500244730ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void larfb_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dV, const rocblas_int ldv, T dT, const rocblas_int ldt, T dA, const rocblas_int lda) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_larfb(nullptr, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, rocblas_side(0), trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, rocblas_operation(0), direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, rocblas_direct(0), storev, m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, rocblas_storev(0), m, n, k, dV, ldv, dT, ldt, dA, lda), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T) nullptr, ldv, dT, ldt, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV, ldv, (T) nullptr, ldt, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, (T) nullptr, lda), rocblas_status_invalid_pointer); // quick return with invalid 
pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, rocblas_side_left, trans, direct, storev, 0, n, k, (T) nullptr, ldv, dT, ldt, (T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, rocblas_side_right, trans, direct, storev, m, 0, k, (T) nullptr, ldv, dT, ldt, (T) nullptr, lda), rocblas_status_success); } template void testing_larfb_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_direct direct = rocblas_forward_direction; rocblas_storev storev = rocblas_column_wise; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int ldv = 1; rocblas_int ldt = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dT(1, 1, 1, 1); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dT.memcheck()); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments larfb_checkBadArgs(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda); } template void larfb_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dT, const rocblas_int ldt, Td& dA, const rocblas_int lda, Th& hV, Th& hT, Th& hA, std::vector& hW, size_t sizeW) { if(CPU) { bool left = (side == rocblas_side_left); bool forward = (direct == rocblas_forward_direction); bool column = (storev == rocblas_column_wise); std::vector htau(k); rocblas_init(hV, true); rocblas_init(hA, true); rocblas_init(hT, true); // scale to avoid singularities // create householder reflectors and triangular factor if(left) { if(column) { for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } 
if(forward) cpu_geqrf(m, k, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_geqlf(m, k, hV[0], ldv, htau.data(), hW.data(), sizeW); } else { for(int i = 0; i < k; ++i) { for(int j = 0; j < m; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(forward) cpu_gelqf(k, m, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_gerqf(k, m, hV[0], ldv, htau.data(), hW.data(), sizeW); } cpu_larft(direct, storev, m, k, hV[0], ldv, htau.data(), hT[0], ldt); } else { if(column) { for(int i = 0; i < n; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(forward) cpu_geqrf(n, k, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_geqlf(n, k, hV[0], ldv, htau.data(), hW.data(), sizeW); } else { for(int i = 0; i < k; ++i) { for(int j = 0; j < n; ++j) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(forward) cpu_gelqf(k, n, hV[0], ldv, htau.data(), hW.data(), sizeW); else cpu_gerqf(k, n, hV[0], ldv, htau.data(), hW.data(), sizeW); } cpu_larft(direct, storev, n, k, hV[0], ldv, htau.data(), hT[0], ldt); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dV.transfer_from(hV)); CHECK_HIP_ERROR(dT.transfer_from(hT)); CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void larfb_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dT, const rocblas_int ldt, Td& dA, const rocblas_int lda, Th& hV, Th& hT, Th& hA, Th& hAr, double* max_err) { bool left = (side == rocblas_side_left); rocblas_int ldw = left ? 
n : m; size_t sizeW = size_t(ldw) * k; std::vector hW(sizeW); // initialize data larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_larfb(side, trans, direct, storev, m, n, k, hV[0], ldv, hT[0], ldt, hA[0], lda, hW.data(), ldw); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void larfb_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dT, const rocblas_int ldt, Td& dA, const rocblas_int lda, Th& hV, Th& hT, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { bool left = (side == rocblas_side_left); rocblas_int ldw = left ? 
n : m; size_t sizeW = size_t(ldw) * k; std::vector hW(sizeW); if(!perf) { larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larfb(side, trans, direct, storev, m, n, k, hV[0], ldv, hT[0], ldt, hA[0], lda, hW.data(), ldw); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); // cold calls for(int iter = 0; iter < 2; iter++) { larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); CHECK_ROCBLAS_ERROR(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larfb_initData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hW, sizeW); start = get_time_us_sync(stream); rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larfb(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); char directC = argus.get("direct"); char storevC = argus.get("storev"); rocblas_int k = argus.get("k"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int ldv = argus.get("ldv", storevC == 'R' ? 
k : (sideC == 'L' ? m : n)); rocblas_int lda = argus.get("lda", m); rocblas_int ldt = argus.get("ldt", k); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_direct direct = char2rocblas_direct(directC); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_int hot_calls = argus.iters; // check non-supported values if(side != rocblas_side_left && side != rocblas_side_right) { EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T*)nullptr, ldv, (T*)nullptr, ldt, (T*)nullptr, lda), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool row = (storev == rocblas_row_wise); bool left = (side == rocblas_side_left); size_t size_V = size_t(ldv) * k; if(row) size_V = left ? size_t(ldv) * m : size_t(ldv) * n; size_t size_T = size_t(ldt) * k; size_t size_A = size_t(lda) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 1 || ldt < k || lda < m || (row && ldv < k) || (!row && !left && ldv < n) || (!row && left && ldv < m)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T*)nullptr, ldv, (T*)nullptr, ldt, (T*)nullptr, lda), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, (T*)nullptr, ldv, (T*)nullptr, ldt, (T*)nullptr, lda)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hT(size_T, 1, size_T, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hV(size_V, 1, size_V, 1); device_strided_batch_vector dT(size_T, 1, size_T, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dV(size_V, 1, size_V, 1); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_T) CHECK_HIP_ERROR(dT.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_larfb(handle, side, trans, direct, storev, m, n, k, dV.data(), ldv, dT.data(), ldt, dA.data(), lda), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larfb_getError(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, hAr, &max_error); // collect performance data if(argus.timing) 
larfb_getPerfData(handle, side, trans, direct, storev, m, n, k, dV, ldv, dT, ldt, dA, lda, hV, hT, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "direct", "storev", "m", "n", "k", "ldv", "ldt", "lda"); rocsolver_bench_output(sideC, transC, directC, storevC, m, n, k, ldv, ldt, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARFB(...) extern template void testing_larfb<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARFB, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larfg.cpp000066400000000000000000000032661503202240500244740ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_larfg.hpp" #define TESTING_LARFG(...) template void testing_larfg<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARFG, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larfg.hpp000066400000000000000000000263121503202240500244760ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void larfg_checkBadArgs(const rocblas_handle handle, const I n, T da, T dx, const I inc, T dt) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_larfg(nullptr, n, da, dx, inc, dt), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, (T) nullptr, dx, inc, dt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, da, (T) nullptr, inc, dt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, da, dx, inc, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, 
(I)0, (T) nullptr, (T) nullptr, inc, (T) nullptr), rocblas_status_success); } template void testing_larfg_bad_arg() { // safe arguments rocblas_local_handle handle; I n = 2; I inc = 1; // memory allocation device_strided_batch_vector da(1, 1, 1, 1); device_strided_batch_vector dx(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); CHECK_HIP_ERROR(da.memcheck()); CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check bad arguments larfg_checkBadArgs(handle, n, da.data(), dx.data(), inc, dt.data()); } template void larfg_initData(const rocblas_handle handle, const I n, Td& da, Td& dx, const I inc, Td& dt, Th& ha, Th& hx, Th& ht) { if(CPU) { rocblas_init(ha, true); rocblas_init(hx, true); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(da.transfer_from(ha)); CHECK_HIP_ERROR(dx.transfer_from(hx)); } } template void larfg_getError(const rocblas_handle handle, const I n, Td& da, Td& dx, const I inc, Td& dt, Th& ha, Th& hx, Th& hxr, Th& ht, double* max_err) { // initialize data larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data())); CHECK_HIP_ERROR(hxr.transfer_from(dx)); // CPU lapack cpu_larfg(n, ha[0], hx[0], inc, ht[0]); // error is ||hx - hxr|| (not necessary to check tau, for now) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using norm-1 which is infinity norm for this data setup *max_err = norm_error('O', 1, n - 1, inc, hx[0], hxr[0]); } template void larfg_getPerfData(const rocblas_handle handle, const I n, Td& da, Td& dx, const I inc, Td& dt, Th& ha, Th& hx, Th& ht, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larfg(n, ha[0], hx[0], inc, ht[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); // cold calls for(int iter = 0; iter < 2; iter++) { larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); CHECK_ROCBLAS_ERROR(rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larfg_initData(handle, n, da, dx, inc, dt, ha, hx, ht); start = get_time_us_sync(stream); rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larfg(Arguments& argus) { // get arguments rocblas_local_handle handle; I n = argus.get("n"); I inc = argus.get("incx"); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes // size_x could be zero in test cases that are not quick-return or invalid // cases setting it to one to avoid possible memory access errors in the rest // of the unit 
test size_t size_x = n > 1 ? size_t(n - 1) : 1; size_t stx = size_x * inc; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_xr = (argus.unit_check || argus.norm_check) ? size_x : 0; size_t stxr = (argus.unit_check || argus.norm_check) ? stx : 0; // check invalid sizes bool invalid_size = (n < 0 || inc < 1); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, (T*)nullptr, (T*)nullptr, inc, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_larfg(handle, n, (T*)nullptr, (T*)nullptr, inc, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hx(size_x, inc, stx, 1); host_strided_batch_vector hxr(size_xr, inc, stxr, 1); host_strided_batch_vector ha(1, 1, 1, 1); host_strided_batch_vector ht(1, 1, 1, 1); device_strided_batch_vector dx(size_x, inc, stx, 1); device_strided_batch_vector da(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); CHECK_HIP_ERROR(da.memcheck()); if(size_x) CHECK_HIP_ERROR(dx.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_larfg(handle, n, da.data(), dx.data(), inc, dt.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larfg_getError(handle, n, da, dx, inc, dt, ha, hx, hxr, ht, &max_error); // collect performance data if(argus.timing) larfg_getPerfData(handle, n, da, dx, inc, dt, ha, hx, ht, &gpu_time_used, &cpu_time_used, hot_calls, 
argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "inc"); rocsolver_bench_output(n, inc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARFG(...) extern template void testing_larfg<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARFG, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larft.cpp000066400000000000000000000032441503202240500245050ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_larft.hpp" #define TESTING_LARFT(...) template void testing_larft<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LARFT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_larft.hpp000066400000000000000000000363531503202240500245210ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void larft_checkBadArgs(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, T dV, const rocblas_int ldv, T dt, T dT, const rocblas_int ldt) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_larft(nullptr, direct, storev, n, k, dV, ldv, dt, dT, ldt), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_larft(handle, rocblas_direct(0), storev, n, k, dV, ldv, dt, dT, ldt), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_larft(handle, direct, rocblas_storev(0), n, k, dV, ldv, dt, dT, ldt), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, (T) nullptr, ldv, dt, dT, ldt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, dV, ldv, 
(T) nullptr, dT, ldt), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, dV, ldv, dt, (T) nullptr, ldt), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, 0, k, (T) nullptr, ldv, dt, dT, ldt), rocblas_status_success); } template void testing_larft_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_direct direct = rocblas_forward_direction; rocblas_storev storev = rocblas_column_wise; rocblas_int k = 1; rocblas_int n = 1; rocblas_int ldv = 1; rocblas_int ldt = 1; // memory allocation device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dt(1, 1, 1, 1); device_strided_batch_vector dT(1, 1, 1, 1); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dT.memcheck()); CHECK_HIP_ERROR(dt.memcheck()); // check bad arguments larft_checkBadArgs(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt); } template void larft_initData(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dt, Td& dT, const rocblas_int ldt, Th& hV, Th& ht, Th& hT, std::vector& hw, size_t size_w) { if(CPU) { rocblas_init(hV, true); // scale to avoid singularities // and create householder reflectors if(storev == rocblas_column_wise) { for(int j = 0; j < k; ++j) { for(int i = 0; i < n; ++i) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(direct == rocblas_forward_direction) cpu_geqrf(n, k, hV[0], ldv, ht[0], hw.data(), k); else cpu_geqlf(n, k, hV[0], ldv, ht[0], hw.data(), k); } else { for(int j = 0; j < n; ++j) { for(int i = 0; i < k; ++i) { if(i == j) hV[0][i + j * ldv] += 400; else hV[0][i + j * ldv] -= 4; } } if(direct == rocblas_forward_direction) cpu_gelqf(k, n, hV[0], ldv, ht[0], hw.data(), k); else cpu_gerqf(k, n, hV[0], ldv, ht[0], hw.data(), k); } } if(GPU) { // copy data from 
CPU to device CHECK_HIP_ERROR(dV.transfer_from(hV)); CHECK_HIP_ERROR(dt.transfer_from(ht)); } } template void larft_getError(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dt, Td& dT, const rocblas_int ldt, Th& hV, Th& ht, Th& hT, Th& hTr, double* max_err) { size_t size_w = size_t(k); std::vector hw(size_w); // initialize data larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_larft(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt)); CHECK_HIP_ERROR(hTr.transfer_from(dT)); // CPU lapack cpu_larft(direct, storev, n, k, hV[0], ldv, ht[0], hT[0], ldt); // error is ||hT - hTr|| / ||hT|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = (direct == rocblas_forward_direction) ? 
norm_error_upperTr('F', k, k, ldt, hT[0], hTr[0]) : norm_error_lowerTr('F', k, k, ldt, hT[0], hTr[0]); } template void larft_getPerfData(const rocblas_handle handle, const rocblas_direct direct, const rocblas_storev storev, const rocblas_int n, const rocblas_int k, Td& dV, const rocblas_int ldv, Td& dt, Td& dT, const rocblas_int ldt, Th& hV, Th& ht, Th& hT, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_w = size_t(k); std::vector hw(size_w); if(!perf) { larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_larft(direct, storev, n, k, hV[0], ldv, ht[0], hT[0], ldt); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); // cold calls for(int iter = 0; iter < 2; iter++) { larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); CHECK_ROCBLAS_ERROR(rocsolver_larft(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { larft_initData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hw, size_w); start = get_time_us_sync(stream); rocsolver_larft(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_larft(Arguments& argus) { // get arguments 
rocblas_local_handle handle; char directC = argus.get("direct"); char storevC = argus.get("storev"); rocblas_int k = argus.get("k"); rocblas_int n = argus.get("n"); rocblas_int ldv = argus.get("ldv", storevC == 'C' ? n : k); rocblas_int ldt = argus.get("ldt", k); rocblas_direct direct = char2rocblas_direct(directC); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes bool row = (storev == rocblas_row_wise); size_t size_T = size_t(ldt) * k; size_t size_tau = size_t(k); size_t size_V = row ? size_t(ldv) * n : size_t(ldv) * k; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Tr = (argus.unit_check || argus.norm_check) ? size_T : 0; // check invalid sizes bool invalid_size = (n < 0 || k < 1 || ldt < k || (row && ldv < k) || (!row && ldv < n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, (T*)nullptr, ldv, (T*)nullptr, (T*)nullptr, ldt), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_larft(handle, direct, storev, n, k, (T*)nullptr, ldv, (T*)nullptr, (T*)nullptr, ldt)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hT(size_T, 1, size_T, 1); host_strided_batch_vector hTr(size_Tr, 1, size_Tr, 1); host_strided_batch_vector ht(size_tau, 1, size_tau, 1); host_strided_batch_vector hV(size_V, 1, size_V, 1); device_strided_batch_vector dT(size_T, 1, size_T, 1); device_strided_batch_vector dt(size_tau, 1, size_tau, 1); device_strided_batch_vector 
dV(size_V, 1, size_V, 1); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_T) CHECK_HIP_ERROR(dT.memcheck()); if(size_tau) CHECK_HIP_ERROR(dt.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_larft(handle, direct, storev, n, k, dV.data(), ldv, dt.data(), dT.data(), ldt), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) larft_getError(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, hTr, &max_error); // collect performance data if(argus.timing) larft_getPerfData(handle, direct, storev, n, k, dV, ldv, dt, dT, ldt, hV, ht, hT, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("direct", "storev", "n", "k", "ldv", "ldt"); rocsolver_bench_output(directC, storevC, n, k, ldv, ldt); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LARFT(...) 
extern template void testing_larft<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LARFT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lasr.cpp000066400000000000000000000032331503202240500243340ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_lasr.hpp" #define TESTING_LASR(...) 
template void testing_lasr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LASR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lasr.hpp000066400000000000000000000374101503202240500243450ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void lasr_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_pivot pivot, const rocblas_direct direct, const rocblas_int m, const rocblas_int n, S dC, S dS, T dA, const rocblas_int lda) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_lasr(nullptr, side, pivot, direct, m, n, dC, dS, dA, lda), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_lasr(handle, rocblas_side(0), pivot, direct, m, n, dC, dS, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_lasr(handle, side, rocblas_pivot(0), direct, m, n, dC, dS, dA, lda), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_lasr(handle, side, pivot, rocblas_direct(0), m, n, dC, dS, dA, lda), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, (S) nullptr, dS, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, dC, (S) nullptr, dA, lda), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, dC, dS, (T) nullptr, lda), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_left, pivot, direct, 0, n, (S) nullptr, (S) nullptr, (T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_lasr(handle, rocblas_side_right, pivot, direct, 0, n, dC, dS, (T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_right, pivot, direct, m, 0, (S) nullptr, (S) nullptr, 
(T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_lasr(handle, rocblas_side_left, pivot, direct, m, 0, dC, dS, (T) nullptr, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_left, pivot, direct, 1, n, (S) nullptr, (S) nullptr, dA, lda), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, rocblas_side_right, pivot, direct, m, 1, (S) nullptr, (S) nullptr, dA, lda), rocblas_status_success); } template void testing_lasr_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; // memory allocation device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments lasr_checkBadArgs(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda); } template void lasr_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_pivot pivot, const rocblas_direct direct, const rocblas_int m, const rocblas_int n, Sd& dC, Sd& dS, Td& dA, const rocblas_int lda, Sh& hC, Sh& hS, Th& hA) { if(CPU) { using S = decltype(std::real(T{})); rocblas_init(hA, true); // construct C and S such that C^2 + S^2 = 1 rocblas_init(hC, true); rocblas_int size = (side == rocblas_side_left) ? 
m - 1 : n - 1; for(rocblas_int j = 0; j < size; ++j) { S temp = hC[0][j]; temp = (temp - 5) / 5.0; hC[0][j] = temp; hS[0][j] = sqrt(1 - temp * temp); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dC.transfer_from(hC)); CHECK_HIP_ERROR(dS.transfer_from(hS)); CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void lasr_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_pivot pivot, const rocblas_direct direct, const rocblas_int m, const rocblas_int n, Sd& dC, Sd& dS, Td& dA, const rocblas_int lda, Sh& hC, Sh& hS, Th& hA, Th& hAr, double* max_err) { // initialize data lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_lasr(side, pivot, direct, m, n, hC[0], hS[0], hA[0], lda); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void lasr_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_pivot pivot, const rocblas_direct direct, const rocblas_int m, const rocblas_int n, Sd& dC, Sd& dS, Td& dA, const rocblas_int lda, Sh& hC, Sh& hS, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_lasr(side, pivot, direct, m, n, hC[0], hS[0], hA[0], lda); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); // cold calls for(int iter = 0; iter < 2; iter++) { lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); CHECK_ROCBLAS_ERROR(rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { lasr_initData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA); start = get_time_us_sync(stream); rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_lasr(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char 
pivotC = argus.get("pivot"); char directC = argus.get("direct"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_side side = char2rocblas_side(sideC); rocblas_pivot pivot = char2rocblas_pivot(pivotC); rocblas_direct direct = char2rocblas_direct(directC); rocblas_int hot_calls = argus.iters; // check non-supported values if(side != rocblas_side_left && side != rocblas_side_right) { EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, lda), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_lasr(handle, side, pivot, direct, m, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, lda), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // determine sizes bool left = (side == rocblas_side_left); bool right = (side == rocblas_side_right); size_t size_CS = 0; if(left && m > 1) size_CS = size_t(m - 1); if(right && n > 1) size_CS = size_t(n - 1); size_t size_A = size_t(lda) * n; size_t size_Ar = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_lasr(handle, side, pivot, direct, m, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, lda)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hC(size_CS, 1, size_CS, 1); host_strided_batch_vector hS(size_CS, 1, size_CS, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_CS, 1, size_CS, 1); device_strided_batch_vector dS(size_CS, 1, size_CS, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_CS) { CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); } // check quick return bool quickreturn = (left && m < 2) || (right && n < 2) || n == 0 || m == 0; if(quickreturn) { EXPECT_ROCBLAS_STATUS( rocsolver_lasr(handle, side, pivot, direct, m, n, dC.data(), dS.data(), dA.data(), lda), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check computations if(argus.unit_check || argus.norm_check) lasr_getError(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA, hAr, &max_error); // collect performance data if(argus.timing && hot_calls > 0) lasr_getPerfData(handle, side, pivot, direct, m, n, dC, dS, dA, lda, hC, hS, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? 
m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "pivot", "direct", "m", "n", "lda"); rocsolver_bench_output(sideC, pivotC, directC, m, n, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LASR(...) extern template void testing_lasr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LASR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_laswp.cpp000066400000000000000000000032441503202240500245230ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_laswp.hpp" #define TESTING_LASWP(...) template void testing_laswp<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LASWP, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_laswp.hpp000066400000000000000000000301161503202240500245260ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void laswp_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_int k1, const rocblas_int k2, U dIpiv, const rocblas_int inc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_laswp(nullptr, n, dA, lda, k1, k2, dIpiv, inc), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, n, (T) nullptr, lda, k1, k2, dIpiv, inc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, n, dA, lda, k1, k2, (U) nullptr, inc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, 0, (T) nullptr, lda, k1, k2, dIpiv, inc), rocblas_status_success); } template void testing_laswp_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; 
rocblas_int k1 = 1; rocblas_int k2 = 2; rocblas_int inc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments laswp_checkBadArgs(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc); } template void laswp_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int k1, const rocblas_int k2, Ud& dIpiv, const rocblas_int inc, Th& hA, Uh& hIpiv) { if(CPU) { // for simplicity consider number of rows m = lda rocblas_init(hA, true); rocblas_init(hIpiv, true); // put indices in range [1, x] // for simplicity, consider x = lda as this is the number of rows for(rocblas_int i = 0; i < hIpiv.n(); ++i) hIpiv[0][i] = hIpiv[0][i] * lda < 10 ? 1 : hIpiv[0][i] * lda / 10; } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void laswp_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int k1, const rocblas_int k2, Ud& dIpiv, const rocblas_int inc, Th& hA, Th& hAr, Uh& hIpiv, double* max_err) { // initialize data laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_laswp(n, hA[0], lda, k1, k2, hIpiv[0], inc); // error |hA - hAr| (elements must be identical) *max_err = 0; double diff; for(int i = 0; i < lda; i++) { for(int j = 0; j < n; j++) { diff = std::abs(hAr[0][i + j * lda] - hA[0][i + j * lda]); *max_err = diff > *max_err ? 
diff : *max_err; } } } template void laswp_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int k1, const rocblas_int k2, Ud& dIpiv, const rocblas_int inc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_laswp(n, hA[0], lda, k1, k2, hIpiv[0], inc); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { laswp_initData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_laswp(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int k1 = argus.get("k1"); rocblas_int k2 = argus.get("k2"); rocblas_int lda = argus.get("lda", k2); rocblas_int inc = argus.get("incx"); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; 
size_t size_P = k1 + size_t(k2 - k1) * abs(inc); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < 1 || !inc || k1 < 1 || k2 < 1 || k2 < k1); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_laswp(handle, n, (T*)nullptr, lda, k1, k2, (rocblas_int*)nullptr, inc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY( rocsolver_laswp(handle, n, (T*)nullptr, lda, k1, k2, (rocblas_int*)nullptr, inc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_laswp(handle, n, dA.data(), lda, k1, k2, dIpiv.data(), inc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) laswp_getError(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) laswp_getPerfData(handle, n, dA, lda, k1, k2, dIpiv, inc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for 
rocsolver-test // no tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 0); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "lda", "k1", "k2", "inc"); rocsolver_bench_output(n, lda, k1, k2, inc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LASWP(...) extern template void testing_laswp<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LASWP, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lasyf.cpp000066400000000000000000000032441503202240500245130ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_lasyf.hpp" #define TESTING_LASYF(...) template void testing_lasyf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LASYF, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lasyf.hpp000066400000000000000000000416201503202240500245200ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void lasyf_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nb, rocblas_int* kb, T dA, const rocblas_int lda, rocblas_int* ipiv, rocblas_int* info) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(nullptr, uplo, n, nb, kb, dA, lda, ipiv, info), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, rocblas_fill_full, n, nb, kb, dA, lda, ipiv, info), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, dA, lda, ipiv, info), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, kb, (T) nullptr, lda, ipiv, info), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, n, nb, kb, dA, lda, (rocblas_int*)nullptr, info), 
rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, n, nb, kb, dA, lda, ipiv, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_lasyf(handle, uplo, 0, 0, kb, (T) nullptr, lda, (rocblas_int*)nullptr, info), rocblas_status_success); } template void testing_lasyf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int nb = 1; rocblas_int lda = 1; // memory allocations device_strided_batch_vector dKB(1, 1, 1, 1); device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dKB.memcheck()); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments lasyf_checkBadArgs(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data()); } template void lasyf_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[0][i + j * lda]; hA[0][i + j * lda] = hA[0][n - 1 - i + j * lda]; hA[0][n - 1 - i + j * lda] = tmp; } } if(singular) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int j = n / 4; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[0][i + j * lda] = 0; hA[0][j + i * lda] = 0; } j = n / 2; j -= (j / n) * n; for(rocblas_int 
i = 0; i < n; i++) { hA[0][i + j * lda] = 0; hA[0][j + i * lda] = 0; } j = n - 1; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[0][i + j * lda] = 0; hA[0][j + i * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void lasyf_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nb, Ud& dKB, Td& dA, const rocblas_int lda, Ud& dIpiv, Ud& dInfo, Uh& hKB, Uh& hKBRes, Th& hA, Th& hARes, Uh& hIpiv, Uh& hIpivRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { int ldw = n; int lwork = ldw * nb; std::vector work(lwork); // input data initialization lasyf_initData(handle, n, dA, lda, hA, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data())); CHECK_HIP_ERROR(hKBRes.transfer_from(dKB)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hIpivRes.transfer_from(dIpiv)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack cpu_lasyf(uplo, n, nb, hKB[0], hA[0], lda, hIpiv[0], work.data(), ldw, hInfo[0]); // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY // ISSUES. IT MIGHT BE REVISITED IN THE FUTURE) using frobenius norm double err; *max_err = 0; err = norm_error('F', n, n, lda, hA[0], hARes[0]); *max_err = err > *max_err ? err : *max_err; // also check pivoting (count the number of incorrect pivots) err = 0; if(uplo == rocblas_fill_upper) { for(rocblas_int i = n - hKBRes[0][0]; i < n; ++i) { EXPECT_EQ(hIpiv[0][i], hIpivRes[0][i]) << "where i = " << i; if(hIpiv[0][i] != hIpivRes[0][i]) err++; } } else { for(rocblas_int i = 0; i < hKBRes[0][0]; ++i) { EXPECT_EQ(hIpiv[0][i], hIpivRes[0][i]) << "where i = " << i; if(hIpiv[0][i] != hIpivRes[0][i]) err++; } } *max_err = err > *max_err ? 
err : *max_err; // also check kb err = 0; EXPECT_EQ(hKB[0][0], hKBRes[0][0]); if(hKB[0][0] != hKBRes[0][0]) err++; *max_err += err; // also check info err = 0; EXPECT_EQ(hInfo[0][0], hInfoRes[0][0]); if(hInfo[0][0] != hInfoRes[0][0]) err++; *max_err += err; } template void lasyf_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nb, Ud& dKB, Td& dA, const rocblas_int lda, Ud& dIpiv, Ud& dInfo, Uh& hKB, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { int ldw = n; int lwork = ldw * nb; std::vector work(lwork); if(!perf) { lasyf_initData(handle, n, dA, lda, hA, singular); // cpu-lapack performance *cpu_time_used = get_time_us_no_sync(); cpu_lasyf(uplo, n, nb, hKB[0], hA[0], lda, hIpiv[0], work.data(), ldw, hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } lasyf_initData(handle, n, dA, lda, hA, singular); // cold calls for(int iter = 0; iter < 2; iter++) { lasyf_initData(handle, n, dA, lda, hA, singular); CHECK_ROCBLAS_ERROR(rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { lasyf_initData(handle, n, dA, lda, hA, singular); start = get_time_us_sync(stream); rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_lasyf(Arguments& argus) { // get arguments 
rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int nb = argus.get("nb", n); rocblas_int lda = argus.get("lda", n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, (T*)nullptr, lda, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = lda * n; size_t size_Ipiv = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_IpivRes = (argus.unit_check || argus.norm_check) ? size_Ipiv : 0; // check invalid sizes bool invalid_size = (n < 0 || nb < 0 || nb > n || lda < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, (T*)nullptr, lda, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_lasyf(handle, uplo, n, nb, (rocblas_int*)nullptr, (T*)nullptr, lda, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hKB(1, 1, 1, 1); host_strided_batch_vector hKBRes(1, 1, 1, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hARes(size_ARes, 1, size_ARes, 1); 
host_strided_batch_vector hIpiv(size_Ipiv, 1, size_Ipiv, 1); host_strided_batch_vector hIpivRes(size_IpivRes, 1, size_IpivRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dKB(1, 1, 1, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_Ipiv, 1, size_Ipiv, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_Ipiv) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dKB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(nb == 0 || n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_lasyf(handle, uplo, n, nb, dKB.data(), dA.data(), lda, dIpiv.data(), dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) lasyf_getError(handle, uplo, n, nb, dKB, dA, lda, dIpiv, dInfo, hKB, hKBRes, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) lasyf_getPerfData(handle, uplo, n, nb, dKB, dA, lda, dIpiv, dInfo, hKB, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "nb", "lda"); rocsolver_bench_output(uploC, n, nb, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { 
if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LASYF(...) extern template void testing_lasyf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LASYF, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_latrd.cpp000066400000000000000000000032441503202240500245030ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #include "testing_latrd.hpp" #define TESTING_LATRD(...) template void testing_latrd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LATRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_latrd.hpp000066400000000000000000000371661503202240500245220ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void latrd_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, S dE, T dTau, T dW, const rocblas_int ldw) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_latrd(nullptr, uplo, n, k, dA, lda, dE, dTau, dW, ldw), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, rocblas_fill_full, n, k, dA, lda, dE, dTau, dW, ldw), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, (T) nullptr, lda, dE, dTau, dW, ldw), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA, lda, (S) nullptr, dTau, dW, ldw), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA, lda, dE, (T) nullptr, dW, ldw), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA, lda, dE, dTau, (T) nullptr, ldw), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, 0, dA, lda, dE, dTau, (T) nullptr, ldw), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, 0, 0, (T) nullptr, lda, (S) nullptr, (T) nullptr, (T) nullptr, ldw), rocblas_status_success); } template void testing_latrd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int k = 1; rocblas_int lda = 1; rocblas_int ldw = 1; // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); 
device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTau(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTau.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); // check bad arguments latrd_checkBadArgs(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw); } template , int> = 0> void latrd_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || (i == j + 1) || (i == j - 1)) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template , int> = 0> void latrd_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[0][i + j * lda] = hA[0][i + j * lda].real() + 400; else if((i == j + 1) || (i == j - 1)) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void latrd_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Sd& dE, Td& dTau, Td& dW, const rocblas_int ldw, Th& hA, Th& hARes, Sh& hE, Th& hTau, Th& hW, Th& hWRes, double* max_err) { // input data initialization latrd_initData(handle, n, dA, lda, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); // CPU lapack cpu_latrd(uplo, n, k, hA[0], lda, hE[0], 
hTau[0], hW[0], ldw); // error is max(||hA - hARes|| / ||hA||, ||hW - hWRes|| / ||hW||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY // ISSUES. IT MIGHT BE REVISITED IN THE FUTURE) using frobenius norm double err; rocblas_int offset = (uplo == rocblas_fill_lower) ? k : 0; *max_err = 0; err = norm_error('F', n, n, lda, hA[0], hARes[0]); *max_err = err > *max_err ? err : *max_err; err = norm_error('F', n - k, k, ldw, hW[0] + offset, hWRes[0] + offset); *max_err = err > *max_err ? err : *max_err; } template void latrd_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Sd& dE, Td& dTau, Td& dW, const rocblas_int ldw, Th& hA, Sh& hE, Th& hTau, Th& hW, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { latrd_initData(handle, n, dA, lda, hA); // cpu-lapack performance *cpu_time_used = get_time_us_no_sync(); memset(hW[0], 0, ldw * k * sizeof(T)); cpu_latrd(uplo, n, k, hA[0], lda, hE[0], hTau[0], hW[0], ldw); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } latrd_initData(handle, n, dA, lda, hA); // cold calls for(int iter = 0; iter < 2; iter++) { latrd_initData(handle, n, dA, lda, hA); CHECK_ROCBLAS_ERROR(rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { latrd_initData(handle, n, dA, lda, hA); start = get_time_us_sync(stream); rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), 
ldw); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_latrd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int k = argus.get("k", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldw = argus.get("ldw", n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, (T*)nullptr, lda, (S*)nullptr, (T*)nullptr, (T*)nullptr, ldw), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = lda * n; size_t size_E = n; size_t size_tau = n; size_t size_W = ldw * k; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; // check invalid sizes bool invalid_size = (n < 0 || k < 0 || k > n || lda < n || ldw < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, (T*)nullptr, lda, (S*)nullptr, (T*)nullptr, (T*)nullptr, ldw), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_latrd(handle, uplo, n, k, (T*)nullptr, lda, (S*)nullptr, (T*)nullptr, (T*)nullptr, ldw)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hARes(size_ARes, 1, size_ARes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hTau(size_tau, 1, size_tau, 1); host_strided_batch_vector hW(size_W, 1, size_W, 1); host_strided_batch_vector hWRes(size_WRes, 1, size_WRes, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dTau(size_tau, 1, size_tau, 1); device_strided_batch_vector dW(size_W, 1, size_W, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_tau) CHECK_HIP_ERROR(dTau.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); // check quick return if(k == 0 || n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_latrd(handle, uplo, n, k, dA.data(), lda, dE.data(), dTau.data(), dW.data(), ldw), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) latrd_getError(handle, uplo, n, k, dA, lda, dE, dTau, dW, ldw, 
hA, hARes, hE, hTau, hW, hWRes, &max_error); // collect performance data if(argus.timing && hot_calls > 0) latrd_getPerfData(handle, uplo, n, k, dA, lda, dE, dTau, dW, ldw, hA, hE, hTau, hW, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using k*n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, k * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "k", "lda", "ldw"); rocsolver_bench_output(uploC, n, k, lda, ldw); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LATRD(...) extern template void testing_latrd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LATRD, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lauum.cpp000066400000000000000000000032441503202240500245200ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_lauum.hpp" #define TESTING_LAUUM(...) template void testing_lauum<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_LAUUM, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_lauum.hpp000066400000000000000000000246661503202240500245400ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void lauum_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T A, const rocblas_int lda) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_lauum(nullptr, uplo, n, A, lda), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, rocblas_fill_full, n, A, lda), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, (T) nullptr, lda), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, 0, (T) nullptr, lda), rocblas_status_success); } template void 
testing_lauum_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments lauum_checkBadArgs(handle, uplo, n, dA.data(), lda); } template void lauum_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA) { if(CPU) { rocblas_init(hA, true); // LAPACK intends that lauum only be called on matrices with a real diagonal for(int i = 0; i < n; i++) { hA[0][i + i * lda] = std::real(hA[0][i + i * lda]); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void lauum_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA, Th& hAr, double* max_err) { // initialize data lauum_initData(handle, uplo, n, dA, lda, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_lauum(handle, uplo, n, dA.data(), lda)); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_lauum(uplo, n, hA[0], lda); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius *max_err = norm_error('F', n, n, lda, hA[0], hAr[0]); } template void lauum_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Th& hA, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { lauum_initData(handle, uplo, n, dA, lda, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_lauum(uplo, n, hA[0], lda); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } lauum_initData(handle, uplo, n, dA, lda, hA); // cold calls for(int iter = 0; iter < 2; iter++) { lauum_initData(handle, uplo, n, dA, lda, hA); CHECK_ROCBLAS_ERROR(rocsolver_lauum(handle, uplo, n, dA.data(), lda)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { lauum_initData(handle, uplo, n, dA, lda, hA); start = get_time_us_sync(stream); rocsolver_lauum(handle, uplo, n, dA.data(), lda); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_lauum(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int hot_calls = argus.iters; rocblas_fill uplo = char2rocblas_fill(uploC); // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, (T*)nullptr, lda), rocblas_status_invalid_value); if(argus.timing) 
rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(n) * lda; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, (T*)nullptr, lda), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_lauum(handle, uplo, n, (T*)nullptr, lda)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_A, 1, size_A, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_lauum(handle, uplo, n, dA.data(), lda), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) lauum_getError(handle, uplo, n, dA, lda, hA, hAr, &max_error); // collect performance data if(argus.timing) lauum_getPerfData(handle, uplo, n, dA, lda, hA, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using machine precision for tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 1); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); 
rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_LAUUM(...) extern template void testing_lauum<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_LAUUM, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgbr_ungbr.cpp000066400000000000000000000032741503202240500257100ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_orgbr_ungbr.hpp" #define TESTING_ORGBR_UNGBR(...) template void testing_orgbr_ungbr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGBR_UNGBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgbr_ungbr.hpp000066400000000000000000000353771503202240500257260ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void orgbr_ungbr_checkBadArgs(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(nullptr, storev, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(handle, rocblas_storev(0), m, n, k, dA, lda, dIpiv), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(handle, storev, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, rocblas_row_wise, 0, n, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, rocblas_column_wise, m, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgbr_ungbr_bad_arg() { // safe 
arguments rocblas_local_handle handle; rocblas_storev storev = rocblas_column_wise; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgbr_ungbr_checkBadArgs(handle, storev, m, n, k, dA.data(), lda, dIpiv.data()); } template void orgbr_ungbr_initData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); size_t s = std::max(hIpiv.n(), int64_t(2)); std::vector E(s - 1); std::vector D(s); std::vector P(s); rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities // and compute gebrd if(storev == rocblas_column_wise) { for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } cpu_gebrd(m, k, hA[0], lda, D.data(), E.data(), hIpiv[0], P.data(), hW.data(), size_W); } else { for(int i = 0; i < k; ++i) { for(int j = 0; j < n; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } cpu_gebrd(k, n, hA[0], lda, D.data(), E.data(), P.data(), hIpiv[0], hW.data(), size_W); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgbr_ungbr_getError(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); // initialize data orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute 
computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_orgbr_ungbr(storev, m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orgbr_ungbr_getPerfData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); if(!perf) { orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_orgbr_ungbr(storev, m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgbr_ungbr_initData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR( rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgbr_ungbr_initData(handle, storev, 
m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgbr_ungbr(Arguments& argus) { // get arguments rocblas_local_handle handle; char storevC = argus.get("storev"); rocblas_int m, n; if(storevC == 'R') { m = argus.get("m"); n = argus.get("n", m); } else { n = argus.get("n"); m = argus.get("m", n); } rocblas_int k = argus.get("k", std::min(m, n)); rocblas_int lda = argus.get("lda", m); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes // size_P could be zero in test cases that are not quick-return or invalid // cases setting it to one to avoid possible memory access errors in the rest // of the unit test bool row = (storev == rocblas_row_wise); size_t size_A = row ? size_t(lda) * n : size_t(lda) * std::max(n, k); size_t size_P = row ? std::max(size_t(std::min(n, k)), size_t(1)) : std::max(size_t(std::min(m, k)), size_t(1)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || lda < m) || (row && (m > n || m < std::min(n, k))) || (!row && (n > m || n < std::min(m, k)))); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, storev, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY( rocsolver_orgbr_ungbr(handle, storev, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orgbr_ungbr(handle, storev, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orgbr_ungbr_getError(handle, storev, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) orgbr_ungbr_getPerfData(handle, storev, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = row ? 
n : m; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("storev", "m", "n", "k", "lda"); rocsolver_bench_output(storevC, m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGBR_UNGBR(...) \ extern template void testing_orgbr_ungbr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGBR_UNGBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orglx_unglx.cpp000066400000000000000000000033251503202240500257450ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_orglx_unglx.hpp" #define TESTING_ORGLX_UNGLX(...) template void testing_orglx_unglx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGLX_UNGLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orglx_unglx.hpp000066400000000000000000000313131503202240500257500ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void orglx_unglx_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, nullptr, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, 0, n, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_orglx_unglx(GLQ, handle, 0, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } 
template void testing_orglx_unglx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orglx_unglx_checkBadArgs(handle, m, n, k, dA.data(), lda, dIpiv.data()); } template void orglx_unglx_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute LQ factorization cpu_gelqf(m, n, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orglx_unglx_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = size_t(m); std::vector hW(size_W); // initialize data orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack GLQ ? cpu_orglq_unglq(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_orgl2_ungl2(m, n, k, hA[0], lda, hIpiv[0], hW.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orglx_unglx_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = size_t(m); std::vector hW(size_W); if(!perf) { orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); GLQ ? cpu_orglq_unglq(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_orgl2_ungl2(m, n, k, hA[0], lda, hIpiv[0], hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orglx_unglx_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orglx_unglx(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); 
rocblas_int n = argus.get("n", m); rocblas_int k = argus.get("k", m); rocblas_int lda = argus.get("lda", m); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(m); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 0 || lda < m || n < m || k > m); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orglx_unglx(GLQ, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orglx_unglx(GLQ, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orglx_unglx(GLQ, handle, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orglx_unglx_getError(handle, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data 
if(argus.timing) orglx_unglx_getPerfData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "k", "lda"); rocsolver_bench_output(m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGLX_UNGLX(...) \ extern template void testing_orglx_unglx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGLX_UNGLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgtr_ungtr.cpp000066400000000000000000000032741503202240500257540ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_orgtr_ungtr.hpp" #define TESTING_ORGTR_UNGTR(...) template void testing_orgtr_ungtr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGTR_UNGTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgtr_ungtr.hpp000066400000000000000000000310241503202240500257530ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void orgtr_ungtr_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(nullptr, uplo, n, dA, lda, dIpiv), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, rocblas_fill(0), n, dA, lda, dIpiv), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); 
// quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgtr_ungtr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 2; rocblas_int lda = 2; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgtr_ungtr_checkBadArgs(handle, uplo, n, dA.data(), lda, dIpiv.data()); } template void orgtr_ungtr_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); size_t s = std::max(hIpiv.n(), int64_t(2)); std::vector E(s - 1); std::vector D(s); rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < n; ++i) { for(int j = 0; j < n; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute sytrd/hetrd cpu_sytrd_hetrd(uplo, n, hA[0], lda, D.data(), E.data(), hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgtr_ungtr_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = n * 32; std::vector hW(size_W); // initialize data orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack cpu_orgtr_ungtr(uplo, n, hA[0], lda, hIpiv[0], hW.data(), size_W); // 
error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', n, n, lda, hA[0], hAr[0]); } template void orgtr_ungtr_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = n * 32; std::vector hW(size_W); if(!perf) { orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_orgtr_ungtr(uplo, n, hA[0], lda, hIpiv[0], hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgtr_ungtr_initData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgtr_ungtr(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); 
rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes // size_P could be zero in test cases that are not quick-return or invalid // cases setting it to one to avoid possible memory access errors in the rest // of the unit test size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orgtr_ungtr(handle, uplo, n, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_orgtr_ungtr(handle, uplo, n, dA.data(), lda, dIpiv.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || 
argus.norm_check) orgtr_ungtr_getError(handle, uplo, n, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) orgtr_ungtr_getPerfData(handle, uplo, n, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGTR_UNGTR(...) \ extern template void testing_orgtr_ungtr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGTR_UNGTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgxl_ungxl.cpp000066400000000000000000000033251503202240500257450ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_orgxl_ungxl.hpp" #define TESTING_ORGXL_UNGXL(...) template void testing_orgxl_ungxl<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGXL_UNGXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgxl_ungxl.hpp000066400000000000000000000313231503202240500257510ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void orgxl_ungxl_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, nullptr, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers 
EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, m, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_orgxl_ungxl(GQL, handle, 0, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgxl_ungxl_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgxl_ungxl_checkBadArgs(handle, m, n, k, dA.data(), lda, dIpiv.data()); } template void orgxl_ungxl_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(m - i == n - j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QL factorization cpu_geqlf(m, n, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgxl_ungxl_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = size_t(n); std::vector hW(size_W); // initialize data orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack GQL ? 
cpu_orgql_ungql(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2l_ung2l(m, n, k, hA[0], lda, hIpiv[0], hW.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orgxl_ungxl_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = size_t(n); std::vector hW(size_W); if(!perf) { orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); GQL ? cpu_orgql_ungql(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2l_ung2l(m, n, k, hA[0], lda, hIpiv[0], hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgxl_ungxl_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, 
dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgxl_ungxl(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int m = argus.get("m", n); rocblas_int k = argus.get("k", n); rocblas_int lda = argus.get("lda", m); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(m); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 0 || lda < m || m < n || k > n); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxl_ungxl(GQL, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orgxl_ungxl(GQL, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxl_ungxl(GQL, handle, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); 
if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orgxl_ungxl_getError(handle, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) orgxl_ungxl_getPerfData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "k", "lda"); rocsolver_bench_output(m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGXL_UNGXL(...) \ extern template void testing_orgxl_ungxl<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGXL_UNGXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgxr_ungxr.cpp000066400000000000000000000033251503202240500257610ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_orgxr_ungxr.hpp" #define TESTING_ORGXR_UNGXR(...) template void testing_orgxr_ungxr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORGXR_UNGXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_orgxr_ungxr.hpp000066400000000000000000000313131503202240500257640ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void orgxr_ungxr_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, nullptr, m, n, k, dA, lda, dIpiv), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, (T) nullptr, lda, dIpiv), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA, lda, (T) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers 
EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, 0, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_orgxr_ungxr(GQR, handle, m, 0, 0, (T) nullptr, lda, (T) nullptr), rocblas_status_success); } template void testing_orgxr_ungxr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments orgxr_ungxr_checkBadArgs(handle, m, n, k, dA.data(), lda, dIpiv.data()); } template void orgxr_ungxr_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, std::vector& hW, size_t size_W) { if(CPU) { rocblas_init(hA, true); rocblas_init(hIpiv, true); // scale to avoid singularities for(int i = 0; i < m; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QR factorization cpu_geqrf(m, n, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void orgxr_ungxr_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hAr, Th& hIpiv, double* max_err) { size_t size_W = size_t(n); std::vector hW(size_W); // initialize data orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, dIpiv.data())); CHECK_HIP_ERROR(hAr.transfer_from(dA)); // CPU lapack GQR ? 
cpu_orgqr_ungqr(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2r_ung2r(m, n, k, hA[0], lda, hIpiv[0], hW.data()); // error is ||hA - hAr|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, lda, hA[0], hAr[0]); } template void orgxr_ungxr_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Th& hA, Th& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = size_t(n); std::vector hW(size_W); if(!perf) { orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); GQR ? cpu_orgqr_ungqr(m, n, k, hA[0], lda, hIpiv[0], hW.data(), size_W) : cpu_org2r_ung2r(m, n, k, hA[0], lda, hIpiv[0], hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, dIpiv.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { orgxr_ungxr_initData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, hW, size_W); start = get_time_us_sync(stream); rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, 
dIpiv.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_orgxr_ungxr(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int m = argus.get("m", n); rocblas_int k = argus.get("k", n); rocblas_int lda = argus.get("lda", m); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Ar = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || k < 0 || lda < m || n > m || k > n); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxr_ungxr(GQR, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_orgxr_ungxr(GQR, handle, m, n, k, (T*)nullptr, lda, (T*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hAr(size_Ar, 1, size_Ar, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_orgxr_ungxr(GQR, handle, m, n, k, dA.data(), lda, dIpiv.data()), rocblas_status_success); 
if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) orgxr_ungxr_getError(handle, m, n, k, dA, lda, dIpiv, hA, hAr, hIpiv, &max_error); // collect performance data if(argus.timing) orgxr_ungxr_getPerfData(handle, m, n, k, dA, lda, dIpiv, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using m * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("m", "n", "k", "lda"); rocsolver_bench_output(m, n, k, lda); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORGXR_UNGXR(...) \ extern template void testing_orgxr_ungxr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORGXR_UNGXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormbr_unmbr.cpp000066400000000000000000000032741503202240500257240ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_ormbr_unmbr.hpp" #define TESTING_ORMBR_UNMBR(...) template void testing_ormbr_unmbr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMBR_UNMBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormbr_unmbr.hpp000066400000000000000000000470771503202240500257420ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void ormbr_unmbr_checkBadArgs(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormbr_unmbr(nullptr, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, 
rocblas_storev(0), side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, rocblas_operation(0), m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side_left, trans, 0, n, k, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side_right, trans, m, 0, k, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } template > void testing_ormbr_unmbr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_storev storev = rocblas_column_wise; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); 
device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormbr_unmbr_checkBadArgs(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); } template void ormbr_unmbr_initData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); size_t s = std::max(hIpiv.n(), int64_t(2)); std::vector E(s - 1); std::vector D(s); std::vector P(s); rocblas_int nq = (side == rocblas_side_left) ? m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities // and compute gebrd if(storev == rocblas_column_wise) { for(int i = 0; i < nq; ++i) { for(int j = 0; j < s; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } cpu_gebrd(nq, s, hA[0], lda, D.data(), E.data(), hIpiv[0], P.data(), hW.data(), size_W); } else { for(int i = 0; i < s; ++i) { for(int j = 0; j < nq; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } cpu_gebrd(s, nq, hA[0], lda, D.data(), E.data(), P.data(), hIpiv[0], hW.data(), size_W); } } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void ormbr_unmbr_getError(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { 
size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); // initialize data ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack cpu_ormbr_unmbr(storev, side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormbr_unmbr_getPerfData(const rocblas_handle handle, const rocblas_storev storev, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); if(!perf) { ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_ormbr_unmbr(storev, side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, 
dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormbr_unmbr_initData(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormbr_unmbr(Arguments& argus) { // get arguments rocblas_local_handle handle; char storevC = argus.get("storev"); char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); } else { n = argus.get("n"); m = argus.get("m", n); } rocblas_int k = argus.get("k", std::min(m, n)); rocblas_int nq = (sideC == 'L' ? m : n); rocblas_int lda = argus.get("lda", storevC == 'C' ? 
nq : std::min(nq, k)); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_storev storev = char2rocblas_storev(storevC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_P = size_t(std::min(nq, k)); size_t size_C = size_t(ldc) * n; bool row = (storev == rocblas_row_wise); size_t size_A = row ? size_t(lda) * nq : size_t(lda) * size_P; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m) || (row && lda < std::min(nq, k)) || (!row && lda < nq)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormbr_unmbr(handle, storev, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormbr_unmbr_getError(handle, storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormbr_unmbr_getPerfData(handle, 
storev, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("storev", "side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(storevC, sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMBR_UNMBR(...) \ extern template void testing_ormbr_unmbr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMBR_UNMBR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormlx_unmlx.cpp000066400000000000000000000033251503202240500257610ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_ormlx_unmlx.hpp" #define TESTING_ORMLX_UNMLX(...) template void testing_ormlx_unmlx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMLX_UNMLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormlx_unmlx.hpp000066400000000000000000000441411503202240500257670ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void ormlx_unmlx_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, nullptr, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, rocblas_operation(0), m, n, k, 
dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side_right, trans, 0, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side_left, trans, m, 0, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } template > void testing_ormlx_unmlx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormlx_unmlx_checkBadArgs(handle, side, trans, m, n, k, dA.data(), lda, 
dIpiv.data(), dC.data(), ldc); } template void ormlx_unmlx_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { rocblas_int nq = (side == rocblas_side_left) ? m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities for(int i = 0; i < k; ++i) { for(int j = 0; j < nq; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute LQ factorization cpu_gelqf(k, nq, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void ormlx_unmlx_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); // initialize data ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack MLQ ? cpu_ormlq_unmlq(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orml2_unml2(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormlx_unmlx_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); if(!perf) { ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); MLQ ? cpu_ormlq_unmlq(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orml2_unml2(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormlx_unmlx_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); 
rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormlx_unmlx(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n, k; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); k = argus.get("k", m); } else { n = argus.get("n"); m = argus.get("m", n); k = argus.get("k", n); } rocblas_int lda = argus.get("lda", k); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_A = left ? size_t(lda) * m : size_t(lda) * n; size_t size_P = size_t(k); size_t size_C = size_t(ldc) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m || lda < k) || (left && k > m) || (!left && k > n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormlx_unmlx(MLQ, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormlx_unmlx_getError(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormlx_unmlx_getPerfData(handle, side, trans, m, n, k, 
dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMLX_UNMLX(...) \ extern template void testing_ormlx_unmlx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMLX_UNMLX, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormtr_unmtr.cpp000066400000000000000000000032741503202240500257700ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_ormtr_unmtr.hpp" #define TESTING_ORMTR_UNMTR(...) template void testing_ormtr_unmtr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMTR_UNMTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormtr_unmtr.hpp000066400000000000000000000435071503202240500260000ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void ormtr_unmtr_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(nullptr, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, rocblas_side(0), uplo, trans, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, rocblas_fill(0), trans, m, n, dA, 
lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, rocblas_operation(0), m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, rocblas_operation_transpose, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, rocblas_operation_conjugate_transpose, m, n, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, rocblas_side_left, uplo, trans, 0, n, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, rocblas_side_right, uplo, trans, m, 0, (T) nullptr, lda, (T) nullptr, (T) nullptr, ldc), rocblas_status_success); } template > void testing_ormtr_unmtr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_fill uplo = rocblas_fill_upper; rocblas_operation trans = rocblas_operation_none; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldc = 2; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormtr_unmtr_checkBadArgs(handle, side, uplo, trans, m, n, 
dA.data(), lda, dIpiv.data(), dC.data(), ldc); } template void ormtr_unmtr_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { using S = decltype(std::real(T{})); rocblas_int nq = (side == rocblas_side_left) ? m : n; std::vector E(nq - 1); std::vector D(nq); rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities for(int i = 0; i < nq; ++i) { for(int j = 0; j < nq; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute sytrd/hetrd cpu_sytrd_hetrd(uplo, nq, hA[0], lda, D.data(), E.data(), hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void ormtr_unmtr_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = (side == rocblas_side_left ? m : n) * 32; std::vector hW(size_W); // initialize data ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack cpu_ormtr_unmtr(side, uplo, trans, m, n, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormtr_unmtr_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_fill uplo, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = (side == rocblas_side_left ? m : n) * 32; std::vector hW(size_W); if(!perf) { ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_ormtr_unmtr(side, uplo, trans, m, n, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormtr_unmtr_initData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, 
dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormtr_unmtr(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char uploC = argus.get("uplo"); char transC = argus.get("trans"); rocblas_int m, n; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); } else { n = argus.get("n"); m = argus.get("m", n); } rocblas_int nq = (sideC == 'L' ? m : n); rocblas_int lda = argus.get("lda", nq); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_P = size_t(nq); size_t size_C = size_t(ldc) * n; size_t size_A = size_t(lda) * nq; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || ldc < m || lda < nq); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormtr_unmtr(handle, side, uplo, trans, m, n, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormtr_unmtr_getError(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormtr_unmtr_getPerfData(handle, side, uplo, trans, m, n, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, 
&cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "uplo", "trans", "m", "n", "lda", "ldc"); rocsolver_bench_output(sideC, uploC, transC, m, n, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMTR_UNMTR(...) \ extern template void testing_ormtr_unmtr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMTR_UNMTR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormxl_unmxl.cpp000066400000000000000000000033251503202240500257610ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_ormxl_unmxl.hpp" #define TESTING_ORMXL_UNMXL(...) template void testing_ormxl_unmxl<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMXL_UNMXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormxl_unmxl.hpp000066400000000000000000000442241503202240500257710ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void ormxl_unmxl_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, nullptr, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, rocblas_operation(0), m, n, k, 
dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side_right, trans, 0, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side_left, trans, m, 0, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } template > void testing_ormxl_unmxl_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormxl_unmxl_checkBadArgs(handle, side, trans, m, n, k, dA.data(), lda, 
dIpiv.data(), dC.data(), ldc); } template void ormxl_unmxl_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { rocblas_int nq = (side == rocblas_side_left) ? m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities for(int i = 0; i < nq; ++i) { for(int j = 0; j < k; ++j) { if(m - i == n - j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QL factorization cpu_geqlf(nq, k, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void ormxl_unmxl_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); // initialize data ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack MQL ? cpu_ormql_unmql(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2l_unm2l(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormxl_unmxl_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); if(!perf) { ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); MQL ? cpu_ormql_unmql(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2l_unm2l(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormxl_unmxl_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); 
rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormxl_unmxl(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n, k; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); k = argus.get("k", m); } else { n = argus.get("n"); m = argus.get("m", n); k = argus.get("k", n); } rocblas_int lda = argus.get("lda", sideC == 'L' ? m : n); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_A = size_t(lda) * k; size_t size_P = size_t(k); size_t size_C = size_t(ldc) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m) || (left && lda < m) || (!left && lda < n) || (left && k > m) || (!left && k > n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxl_unmxl(MQL, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormxl_unmxl_getError(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) 
ormxl_unmxl_getPerfData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMXL_UNMXL(...) \ extern template void testing_ormxl_unmxl<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMXL_UNMXL, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormxr_unmxr.cpp000066400000000000000000000033251503202240500257750ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_ormxr_unmxr.hpp" #define TESTING_ORMXR_UNMXR(...) template void testing_ormxr_unmxr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_ORMXR_UNMXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_ormxr_unmxr.hpp000066400000000000000000000441731503202240500260100ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void ormxr_unmxr_checkBadArgs(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, T dA, const rocblas_int lda, T dIpiv, T dC, const rocblas_int ldc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, nullptr, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side(0), trans, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, rocblas_operation(0), m, n, k, 
dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); if(COMPLEX) EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, rocblas_operation_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, rocblas_operation_conjugate_transpose, m, n, k, dA, lda, dIpiv, dC, ldc), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T) nullptr, lda, dIpiv, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA, lda, (T) nullptr, dC, ldc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side_right, trans, 0, n, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side_left, trans, m, 0, k, dA, lda, dIpiv, (T) nullptr, ldc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, rocblas_side_left, trans, m, n, 0, (T) nullptr, lda, (T) nullptr, dC, ldc), rocblas_status_success); } template > void testing_ormxr_unmxr_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_side side = rocblas_side_left; rocblas_operation trans = rocblas_operation_none; rocblas_int k = 1; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; // memory allocation device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); // check bad arguments ormxr_unmxr_checkBadArgs(handle, side, trans, m, n, k, dA.data(), lda, 
dIpiv.data(), dC.data(), ldc); } template void ormxr_unmxr_initData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, std::vector& hW, size_t size_W) { if(CPU) { rocblas_int nq = (side == rocblas_side_left) ? m : n; rocblas_init(hA, true); rocblas_init(hIpiv, true); rocblas_init(hC, true); // scale to avoid singularities for(int i = 0; i < nq; ++i) { for(int j = 0; j < k; ++j) { if(i == j) hA[0][i + j * lda] += 400; else hA[0][i + j * lda] -= 4; } } // compute QR factorization cpu_geqrf(nq, k, hA[0], lda, hIpiv[0], hW.data(), size_W); } if(GPU) { // copy data from CPU to device CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void ormxr_unmxr_getError(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, Th& hCr, double* max_err) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); // initialize data ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); CHECK_HIP_ERROR(hCr.transfer_from(dC)); // CPU lapack MQR ? cpu_ormqr_unmqr(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2r_unm2r(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); // error is ||hC - hCr|| / ||hC|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm *max_err = norm_error('F', m, n, ldc, hC[0], hCr[0]); } template void ormxr_unmxr_getPerfData(const rocblas_handle handle, const rocblas_side side, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int k, Td& dA, const rocblas_int lda, Td& dIpiv, Td& dC, const rocblas_int ldc, Th& hA, Th& hIpiv, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { size_t size_W = std::max(std::max(m, n), k); std::vector hW(size_W); if(!perf) { ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); MQR ? cpu_ormqr_unmqr(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data(), size_W) : cpu_orm2r_unm2r(side, trans, m, n, k, hA[0], lda, hIpiv[0], hC[0], ldc, hW.data()); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); // cold calls for(int iter = 0; iter < 2; iter++) { ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); CHECK_ROCBLAS_ERROR(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { ormxr_unmxr_initData(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hW, size_W); start = get_time_us_sync(stream); 
rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_ormxr_unmxr(Arguments& argus) { // get arguments rocblas_local_handle handle; char sideC = argus.get("side"); char transC = argus.get("trans"); rocblas_int m, n, k; if(sideC == 'L') { m = argus.get("m"); n = argus.get("n", m); k = argus.get("k", m); } else { n = argus.get("n"); m = argus.get("m", n); k = argus.get("k", n); } rocblas_int lda = argus.get("lda", sideC == 'L' ? m : n); rocblas_int ldc = argus.get("ldc", m); rocblas_side side = char2rocblas_side(sideC); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int hot_calls = argus.iters; // check non-supported values bool invalid_value = (side == rocblas_side_both || (COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes bool left = (side == rocblas_side_left); size_t size_A = size_t(lda) * k; size_t size_P = size_t(k); size_t size_C = size_t(ldc) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_Cr = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = ((m < 0 || n < 0 || k < 0 || ldc < m) || (left && (lda < m || k > m)) || (!left && (lda < n || k > n))); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, (T*)nullptr, lda, (T*)nullptr, (T*)nullptr, ldc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCr(size_Cr, 1, size_Cr, 1); host_strided_batch_vector hIpiv(size_P, 1, size_P, 1); host_strided_batch_vector hA(size_A, 1, size_A, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dIpiv(size_P, 1, size_P, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); // check quick return if(n == 0 || m == 0 || k == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_ormxr_unmxr(MQR, handle, side, trans, m, n, k, dA.data(), lda, dIpiv.data(), dC.data(), ldc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) ormxr_unmxr_getError(handle, side, trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, hCr, &max_error); // collect performance data if(argus.timing) ormxr_unmxr_getPerfData(handle, side, 
trans, m, n, k, dA, lda, dIpiv, dC, ldc, hA, hIpiv, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using s * machine_precision as tolerance rocblas_int s = left ? m : n; if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, s); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("side", "trans", "m", "n", "k", "lda", "ldc"); rocsolver_bench_output(sideC, transC, m, n, k, lda, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_ORMXR_UNMXR(...) \ extern template void testing_ormxr_unmxr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_ORMXR_UNMXR, FOREACH_SCALAR_TYPE, FOREACH_BLOCKED_VARIANT, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stebz.cpp000066400000000000000000000032421503202240500245220ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_stebz.hpp" #define TESTING_STEBZ(...) template void testing_stebz<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEBZ, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stebz.hpp000066400000000000000000000551551503202240500245410ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void stebz_checkBadArgs(const rocblas_handle handle, const rocblas_erange erange, const rocblas_eorder eorder, const rocblas_int n, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, const T abstol, U dD, U dE, rocblas_int* dnev, rocblas_int* dnsplit, U dW, rocblas_int* dIblock, rocblas_int* dIsplit, rocblas_int* dinfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_stebz(nullptr, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, rocblas_erange(0), eorder, n, vl, vu, il, iu, abstol, dD, dE, 
dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, rocblas_eorder(0), n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, (U) nullptr, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, (U) nullptr, dnev, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, (rocblas_int*)nullptr, dnsplit, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, (rocblas_int*)nullptr, dW, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, (U) nullptr, dIblock, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, (rocblas_int*)nullptr, dIsplit, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, (rocblas_int*)nullptr, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, 0, vl, vu, il, iu, abstol, (U) nullptr, (U) nullptr, dnev, dnsplit, (U) nullptr, (rocblas_int*)nullptr, 
(rocblas_int*)nullptr, dinfo), rocblas_status_success); } template void testing_stebz_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 2; rocblas_erange erange = rocblas_erange_all; rocblas_eorder eorder = rocblas_eorder_entire; T vl = 0; T vu = 0; rocblas_int il = 0; rocblas_int iu = 0; T abstol = 0; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dnev(1, 1, 1, 1); device_strided_batch_vector dnsplit(1, 1, 1, 1); device_strided_batch_vector dIblock(1, 1, 1, 1); device_strided_batch_vector dIsplit(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dnev.memcheck()); CHECK_HIP_ERROR(dnsplit.memcheck()); CHECK_HIP_ERROR(dIblock.memcheck()); CHECK_HIP_ERROR(dIsplit.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments stebz_checkBadArgs(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data()); } template void stebz_initData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Th& hD, Th& hE) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add fixed splits in the matrix to test split handling // (scaling ensures that all eigenvalues are in [-20, 20]) for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 10; hE[0][i] = (hE[0][i] - 5) / 10; if(i == n / 4 || i == n / 2 || i == n - 1) hE[0][i] = 0; if(i == n / 7 || i == n / 5 || i == n / 3) hD[0][i] *= -1; } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); } } template void stebz_getError(const rocblas_handle handle, const rocblas_erange erange, const rocblas_eorder eorder, const rocblas_int n, const T vl, const T vu, const 
rocblas_int il, const rocblas_int iu, const T abstol, Td& dD, Td& dE, Ud& dnev, Ud& dnsplit, Td& dW, Ud& dIblock, Ud& dIsplit, Ud& dinfo, Th& hD, Th& hE, Uh& hnev, Uh& hnevRes, Uh& hnsplit, Uh& hnsplitRes, Th& hW, Th& hWRes, Uh& hIblock, Uh& hIblockRes, Uh& hIsplit, Uh& hIsplitRes, Uh& hinfo, Uh& hinfoRes, double* max_err) { std::vector work(4 * n); std::vector iwork(3 * n); // input data initialization stebz_initData(handle, n, dD, dE, hD, hE); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data())); CHECK_HIP_ERROR(hnevRes.transfer_from(dnev)); CHECK_HIP_ERROR(hnsplitRes.transfer_from(dnsplit)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hIblockRes.transfer_from(dIblock)); CHECK_HIP_ERROR(hIsplitRes.transfer_from(dIsplit)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin T atol = (abstol == 0) ? 
2 * get_safemin() : abstol; cpu_stebz(erange, eorder, n, vl, vu, il, iu, atol, hD[0], hE[0], hnev[0], hnsplit[0], hW[0], hIblock[0], hIsplit[0], work.data(), iwork.data(), hinfo[0]); // check info EXPECT_EQ(hinfo[0][0], hinfoRes[0][0]); if(hinfo[0][0] != hinfoRes[0][0]) *max_err = 1; else *max_err = 0; // check number of split blocks rocblas_int ns = hnsplit[0][0]; *max_err += std::abs(ns - hnsplitRes[0][0]); EXPECT_EQ(hnsplit[0][0], hnsplitRes[0][0]); // check split blocks limits for(int k = 0; k < ns; ++k) { *max_err += std::abs(hIsplit[0][k] - hIsplitRes[0][k]); EXPECT_EQ(hIsplit[0][k], hIsplitRes[0][k]) << "where k = " << k; } // if finding eigenvalues succeded, check values if(hinfo[0][0] == 0) { // check number of computed eigenvalues rocblas_int nn = hnev[0][0]; *max_err += std::abs(nn - hnevRes[0][0]); EXPECT_EQ(hnev[0][0], hnevRes[0][0]); // check block indices // (note: as very close eigenvalues could be considered to belong to different // blocks by the CPU and GPU algorithms, only check the block index of distinguishable // eigenvalues) for(int k = 0; k < nn; ++k) { int difb = std::abs(hIblock[0][k] - hIblockRes[0][k]); T difv = std::abs(hW[0][k] - hWRes[0][k]) / hW[0][k]; if(difv > n * get_epsilon()) { EXPECT_EQ(hIblock[0][k], hIblockRes[0][k]) << "where k = " << k; if(difb > 0) *max_err += difb; } } // error is ||hW - hWRes|| / ||hW|| // using frobenius norm double err = norm_error('F', 1, nn, 1, hW[0], hWRes[0]); *max_err = err > *max_err ? 
err : *max_err; } } template void stebz_getPerfData(const rocblas_handle handle, const rocblas_erange erange, const rocblas_eorder eorder, const rocblas_int n, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, const T abstol, Td& dD, Td& dE, Ud& dnev, Ud& dnsplit, Td& dW, Ud& dIblock, Ud& dIsplit, Ud& dinfo, Th& hD, Th& hE, Uh& hnev, Uh& hnsplit, Th& hW, Uh& hIblock, Uh& hIsplit, Uh& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { std::vector work(4 * n); std::vector iwork(3 * n); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin T atol = (abstol == 0) ? 2 * get_safemin() : abstol; stebz_initData(handle, n, dD, dE, hD, hE); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_stebz(erange, eorder, n, vl, vu, il, iu, atol, hD[0], hE[0], hnev[0], hnsplit[0], hW[0], hIblock[0], hIsplit[0], work.data(), iwork.data(), hinfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } stebz_initData(handle, n, dD, dE, hD, hE); // cold calls for(int iter = 0; iter < 2; iter++) { stebz_initData(handle, n, dD, dE, hD, hE); CHECK_ROCBLAS_ERROR(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { stebz_initData(handle, n, dD, dE, hD, hE); start = get_time_us_sync(stream); rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, 
dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stebz(Arguments& argus) { // get arguments rocblas_local_handle handle; char erangeC = argus.get("erange"); char eorderC = argus.get("eorder"); rocblas_int n = argus.get("n"); T vl = T(argus.get("vl", 0)); T vu = T(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); T abstol = T(argus.get("abstol")); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_eorder eorder = char2rocblas_eorder(eorderC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_W = n; size_t size_iblock = n; size_t size_isplit = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? size_W : 0; size_t size_iblockRes = (argus.unit_check || argus.norm_check) ? size_iblock : 0; size_t size_isplitRes = (argus.unit_check || argus.norm_check) ? 
size_isplit : 0; // check invalid sizes bool invalid_size = (n < 0) || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu))) || (erange == rocblas_erange_index && (il < 1 || iu < 0)); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hW(size_W, 1, size_W, 1); host_strided_batch_vector hWRes(size_WRes, 1, size_WRes, 1); host_strided_batch_vector hIblock(size_iblock, 1, size_iblock, 1); host_strided_batch_vector hIblockRes(size_iblockRes, 1, size_iblockRes, 1); host_strided_batch_vector hIsplit(size_isplit, 1, size_isplit, 1); host_strided_batch_vector hIsplitRes(size_isplitRes, 1, size_isplitRes, 1); host_strided_batch_vector hnev(1, 1, 1, 1); host_strided_batch_vector hnevRes(1, 1, 1, 1); host_strided_batch_vector hnsplit(1, 1, 1, 1); host_strided_batch_vector hnsplitRes(1, 1, 1, 1); host_strided_batch_vector hinfo(1, 1, 
1, 1); host_strided_batch_vector hinfoRes(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dW(size_W, 1, size_W, 1); device_strided_batch_vector dIblock(size_iblock, 1, size_iblock, 1); device_strided_batch_vector dIsplit(size_isplit, 1, size_isplit, 1); device_strided_batch_vector dnev(1, 1, 1, 1); device_strided_batch_vector dnsplit(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(size_iblock) CHECK_HIP_ERROR(dIblock.memcheck()); if(size_isplit) CHECK_HIP_ERROR(dIsplit.memcheck()); CHECK_HIP_ERROR(dnev.memcheck()); CHECK_HIP_ERROR(dnsplit.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_stebz(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD.data(), dE.data(), dnev.data(), dnsplit.data(), dW.data(), dIblock.data(), dIsplit.data(), dinfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) stebz_getError(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo, hD, hE, hnev, hnevRes, hnsplit, hnsplitRes, hW, hWRes, hIblock, hIblockRes, hIsplit, hIsplitRes, hinfo, hinfoRes, &max_error); // collect performance data if(argus.timing) stebz_getPerfData(handle, erange, eorder, n, vl, vu, il, iu, abstol, dD, dE, dnev, dnsplit, dW, dIblock, dIsplit, dinfo, hD, hE, hnev, hnsplit, hW, hIblock, hIsplit, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { 
if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("erange", "eorder", "n", "vl", "vu", "il", "iu", "abstol"); rocsolver_bench_output(erangeC, eorderC, n, vl, vu, il, iu, abstol); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEBZ(...) extern template void testing_stebz<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEBZ, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stedc.cpp000066400000000000000000000032441503202240500244770ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_stedc.hpp" #define TESTING_STEDC(...) template void testing_stedc<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEDC, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stedc.hpp000066400000000000000000000532261503202240500245110ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void stedc_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, S dD, S dE, T dC, const rocblas_int ldc, U dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_stedc(nullptr, evect, n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, rocblas_evect(0), n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, (S) nullptr, dE, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, dD, (S) nullptr, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, dD, dE, (T) nullptr, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, dD, dE, dC, ldc, (U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_stedc(handle, evect, 0, (S) nullptr, (S) nullptr, (T) nullptr, ldc, dInfo), rocblas_status_success); } template 
void testing_stedc_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_int n = 2; rocblas_int ldc = 2; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments stedc_checkBadArgs(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); } template void stedc_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo) { if(CPU) { using S = decltype(std::real(T{})); // if the matrix is too small (n < 4), simply initialize D and E if(n < 4) { rocblas_init(hD, true); rocblas_init(hE, true); } // otherwise, the marix will be divided in exactly 2 independent blocks, if the size is even, // or 3 if the size is odd. The 2 main independent blocks will have the same eigenvalues. // The last block, when the size is odd, will have eigenvalue equal 1. else { rocblas_int N1 = n / 2; rocblas_int E = n - 2 * N1; // a. initialize the eigenvalues for the uppermost sub-blocks of the main independent blocks. // The second sub-block will have some repeated eigenvalues in order to test the deflation process S d; rocblas_int NN1 = N1 / 2; rocblas_int NN2 = N1 - NN1; rocblas_int s1 = NN1 * NN1; rocblas_int s2 = NN2 * NN2; rocblas_int sw = NN2 * 32; std::vector A1(s1); std::vector A2(s2); for(rocblas_int i = 0; i < NN1; ++i) { for(rocblas_int j = 0; j < NN1; ++j) { if(i == j) { d = (i + 1) / S(NN1); A1[i + i * NN1] = d; A2[i + i * NN2] = (i % 2 == 0) ? 
d : -d; } else { A1[i + j * NN1] = 0; A2[i + j * NN2] = 0; } } } if(NN2 > NN1) { for(rocblas_int i = 0; i < NN1; ++i) { A2[NN1 + i * NN2] = 0; A2[i + NN1 * NN2] = 0; } A2[NN1 + NN1 * NN2] = 0; } // b. find the corresponding tridiagonal matrices containing the setup eigenvalues of each sub-block // first find random orthogonal matrices Q1 and Q2 Sh Q1(s1, 1, s1, 1); Sh Q2(s2, 1, s2, 1); rocblas_init(Q1, true); rocblas_init(Q2, true); std::vector hW(sw); std::vector ipiv1(NN1); std::vector ipiv2(NN2); cpu_geqrf(NN1, NN1, Q1.data(), NN1, ipiv1.data(), hW.data(), sw); cpu_geqrf(NN2, NN2, Q2.data(), NN2, ipiv2.data(), hW.data(), sw); // now multiply the orthogonal matrices by the diagonals A1 and A2 to hide the eigenvalues cpu_ormqr_unmqr(rocblas_side_left, rocblas_operation_transpose, NN1, NN1, NN1, Q1.data(), NN1, ipiv1.data(), A1.data(), NN1, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_right, rocblas_operation_none, NN1, NN1, NN1, Q1.data(), NN1, ipiv1.data(), A1.data(), NN1, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_left, rocblas_operation_transpose, NN2, NN2, NN2, Q2.data(), NN2, ipiv2.data(), A2.data(), NN2, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_right, rocblas_operation_none, NN2, NN2, NN2, Q2.data(), NN2, ipiv2.data(), A2.data(), NN2, hW.data(), sw); // finally, perform tridiagonalization cpu_sytrd_hetrd(rocblas_fill_upper, NN1, A1.data(), NN1, hD[0], hE[0], ipiv1.data(), hW.data(), sw); cpu_sytrd_hetrd(rocblas_fill_upper, NN2, A2.data(), NN2, hD[0] + NN1, hE[0] + NN1, ipiv2.data(), hW.data(), sw); // c. 
integrate blocks into final matrix // integrate the 2 sub-blocks into the first independent block hE[0][NN1 - 1] = 1; hD[0][NN1 - 1] += 1; hD[0][NN1] += 1; // copy the independent block over for(rocblas_int i = 0; i < N1; ++i) { hD[0][N1 + i] = hD[0][i]; hE[0][N1 + i] = hE[0][i]; } hE[0][N1 - 1] = 0; hE[0][2 * N1 - 1] = 0; // integrate the 2 sub-blocks into the second independent block // (using negative p to test secular eqn algorithm) hE[0][N1 + NN1 - 1] = -1; hD[0][N1 + NN1 - 1] -= 2; hD[0][N1 + NN1] -= 2; // if there is a third independent block, initialize it with 1 if(E == 1) hD[0][n - 1] = 1; } // initialize C to the identity matrix if(evect == rocblas_evect_original) { for(rocblas_int j = 0; j < n; j++) { for(rocblas_int i = 0; i < n; i++) { if(i == j) hC[0][i + j * ldc] = 1; else hC[0][i + j * ldc] = 0; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void stedc_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hDRes, Sh& hE, Sh& hERes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err, double* max_errv) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lgn = floor(log(n - 1) / log(2)) + 1; size_t lwork = (COMPLEX) ? n * n : 0; size_t lrwork = (evect == rocblas_evect_none || n <= 1) ? 1 : 1 + 3 * n + 4 * n * n + 2 * n * lgn; size_t liwork = (evect == rocblas_evect_none || n <= 1) ? 
1 : 6 + 6 * n + 5 * n * lgn; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); // input data initialization stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_stedc(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hCRes.transfer_from(dC)); // if eigenvectors were required, prepare matrix A (upper triangular) for implicit tests rocblas_int lda = n; size_t size_A = lda * n; host_strided_batch_vector hA(size_A, 1, size_A, 1); if(evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[0][i + j * lda] = hD[0][i]; else if(i + 1 == j) hA[0][i + j * lda] = hE[0][i]; else hA[0][i + j * lda] = 0; } } } // CPU lapack cpu_stedc(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[0]); // check info EXPECT_EQ(hInfo[0][0], hInfoRes[0][0]); if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; double err; if(hInfo[0][0] == 0) { // check that eigenvalues are correct and in order // error is ||hD - hDRes|| / ||hD|| // using frobenius norm err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); *max_err = err > *max_err ? 
err : *max_err; // check eigenvectors if required if(evect != rocblas_evect_none) { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[0][j]; cpu_symv_hemv(rocblas_fill_upper, n, alpha, hA[0], lda, hCRes[0] + j * ldc, 1, beta, hC[0] + j * ldc, 1); } // error is ||hC - hCRes|| / ||hC|| // using frobenius norm *max_errv = norm_error('F', n, n, ldc, hCRes[0], hC[0]); } } } template void stedc_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lgn = floor(log(n - 1) / log(2)) + 1; size_t lwork = (COMPLEX) ? n * n : 0; size_t lrwork = (evect == rocblas_evect_none || n <= 1) ? 1 : 1 + 3 * n + 4 * n * n + 2 * n * lgn; size_t liwork = (evect == rocblas_evect_none || n <= 1) ? 
1 : 6 + 6 * n + 5 * n * lgn; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); if(!perf) { stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_stedc(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cold calls for(int iter = 0; iter < 2; iter++) { stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); CHECK_ROCBLAS_ERROR( rocsolver_stedc(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { stedc_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); start = get_time_us_sync(stream); rocsolver_stedc(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stedc(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); rocblas_int n = argus.get("n"); rocblas_int ldc = argus.get("ldc", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_C = ldc * n; double max_err = 0, max_errv = 0, gpu_time_used = 0, cpu_time_used = 0; 
size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; size_t size_ERes = (argus.unit_check || argus.norm_check) ? size_E : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || (evect != rocblas_evect_none && ldc < n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_stedc(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stedc(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hDRes(size_DRes, 1, size_DRes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hERes(size_ERes, 1, size_ERes, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCRes(size_CRes, 1, size_CRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_stedc(handle, evect, n, 
dD.data(), dE.data(), dC.data(), ldc, dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) stedc_getError(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hDRes, hE, hERes, hC, hCRes, hInfo, hInfoRes, &max_err, &max_errv); // collect performance data if(argus.timing) stedc_getPerfData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_err, n); if(evect != rocblas_evect_none) ROCSOLVER_TEST_CHECK(T, max_errv, n * n); } // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("evect", "n", "ldc"); rocsolver_bench_output(evectC, n, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, std::max(max_err, max_errv)); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, std::max(max_err, max_errv)); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEDC(...) extern template void testing_stedc<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEDC, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stedcj.cpp000066400000000000000000000032431503202240500246500ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_stedcj.hpp" #define TESTING_STEDCJ(...) template void testing_stedcj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEDCJ, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stedcj.hpp000066400000000000000000000526131503202240500246620ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void stedcj_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, S dD, S dE, T dC, const rocblas_int ldc, U dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_stedcj(nullptr, evect, n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_stedcj(handle, rocblas_evect(0), n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stedcj(handle, evect, n, (S) nullptr, dE, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcj(handle, evect, n, dD, (S) nullptr, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcj(handle, evect, n, dD, dE, (T) nullptr, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcj(handle, evect, n, dD, dE, dC, ldc, (U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_stedcj(handle, evect, 0, (S) nullptr, (S) nullptr, (T) nullptr, ldc, dInfo), rocblas_status_success); } template void testing_stedcj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_int n = 2; rocblas_int ldc = 2; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); 
CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments stedcj_checkBadArgs(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); } template void stedcj_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo) { if(CPU) { using S = decltype(std::real(T{})); // if the matrix is too small (n < 4), simply initialize D and E if(n < 4) { rocblas_init(hD, true); rocblas_init(hE, true); } // otherwise, the marix will be divided in exactly 2 independent blocks, if the size is even, // or 3 if the size is odd. The 2 main independent blocks will have the same eigenvalues. // The last block, when the size is odd, will have eigenvalue equal 1. else { rocblas_int N1 = n / 2; rocblas_int E = n - 2 * N1; // a. initialize the eigenvalues for the uppermost sub-blocks of the main independent blocks. // The second sub-block will have some repeated eigenvalues in order to test the deflation process S d; rocblas_int NN1 = N1 / 2; rocblas_int NN2 = N1 - NN1; rocblas_int s1 = NN1 * NN1; rocblas_int s2 = NN2 * NN2; rocblas_int sw = NN2 * 32; std::vector A1(s1); std::vector A2(s2); for(rocblas_int i = 0; i < NN1; ++i) { for(rocblas_int j = 0; j < NN1; ++j) { if(i == j) { d = (i + 1) / S(NN1); A1[i + i * NN1] = d; A2[i + i * NN2] = (i % 2 == 0) ? d : -d; } else { A1[i + j * NN1] = 0; A2[i + j * NN2] = 0; } } } if(NN2 > NN1) { for(rocblas_int i = 0; i < NN1; ++i) { A2[NN1 + i * NN2] = 0; A2[i + NN1 * NN2] = 0; } A2[NN1 + NN1 * NN2] = 0; } // b. 
find the corresponding tridiagonal matrices containing the setup eigenvalues of each sub-block // first find random orthogonal matrices Q1 and Q2 Sh Q1(s1, 1, s1, 1); Sh Q2(s2, 1, s2, 1); rocblas_init(Q1, true); rocblas_init(Q2, true); std::vector hW(sw); std::vector ipiv1(NN1); std::vector ipiv2(NN2); cpu_geqrf(NN1, NN1, Q1.data(), NN1, ipiv1.data(), hW.data(), sw); cpu_geqrf(NN2, NN2, Q2.data(), NN2, ipiv2.data(), hW.data(), sw); // now multiply the orthogonal matrices by the diagonals A1 and A2 to hide the eigenvalues cpu_ormqr_unmqr(rocblas_side_left, rocblas_operation_transpose, NN1, NN1, NN1, Q1.data(), NN1, ipiv1.data(), A1.data(), NN1, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_right, rocblas_operation_none, NN1, NN1, NN1, Q1.data(), NN1, ipiv1.data(), A1.data(), NN1, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_left, rocblas_operation_transpose, NN2, NN2, NN2, Q2.data(), NN2, ipiv2.data(), A2.data(), NN2, hW.data(), sw); cpu_ormqr_unmqr(rocblas_side_right, rocblas_operation_none, NN2, NN2, NN2, Q2.data(), NN2, ipiv2.data(), A2.data(), NN2, hW.data(), sw); // finally, perform tridiagonalization cpu_sytrd_hetrd(rocblas_fill_upper, NN1, A1.data(), NN1, hD[0], hE[0], ipiv1.data(), hW.data(), sw); cpu_sytrd_hetrd(rocblas_fill_upper, NN2, A2.data(), NN2, hD[0] + NN1, hE[0] + NN1, ipiv2.data(), hW.data(), sw); // c. 
integrate blocks into final matrix // integrate the 2 sub-blocks into the first independent block hE[0][NN1 - 1] = 1; hD[0][NN1 - 1] += 1; hD[0][NN1] += 1; // copy the independent block over for(rocblas_int i = 0; i < N1; ++i) { hD[0][N1 + i] = hD[0][i]; hE[0][N1 + i] = hE[0][i]; } hE[0][N1 - 1] = 0; hE[0][2 * N1 - 1] = 0; // integrate the 2 sub-blocks into the second independent block // (using negative p to test secular eqn algorithm) hE[0][N1 + NN1 - 1] = -1; hD[0][N1 + NN1 - 1] -= 2; hD[0][N1 + NN1] -= 2; // if there is a third independent block, initialize it with 1 if(E == 1) hD[0][n - 1] = 1; } // initialize C to the identity matrix if(evect == rocblas_evect_original) { for(rocblas_int j = 0; j < n; j++) { for(rocblas_int i = 0; i < n; i++) { if(i == j) hC[0][i + j * ldc] = 1; else hC[0][i + j * ldc] = 0; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void stedcj_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hDRes, Sh& hE, Sh& hERes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err, double* max_errv) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lgn = floor(log(n - 1) / log(2)) + 1; size_t lwork = (COMPLEX) ? n * n : 0; size_t lrwork = (evect == rocblas_evect_none || n <= 1) ? 1 : 1 + 3 * n + 4 * n * n + 2 * n * lgn; size_t liwork = (evect == rocblas_evect_none || n <= 1) ? 
1 : 6 + 6 * n + 5 * n * lgn; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); // input data initialization stedcj_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_stedcj(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hCRes.transfer_from(dC)); // if eigenvectors were required, prepare matrix A (upper triangular) for implicit tests rocblas_int lda = n; size_t size_A = lda * n; host_strided_batch_vector hA(size_A, 1, size_A, 1); if(evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[0][i + j * lda] = hD[0][i]; else if(i + 1 == j) hA[0][i + j * lda] = hE[0][i]; else hA[0][i + j * lda] = 0; } } } // CPU lapack cpu_stedc(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[0]); // check info EXPECT_EQ(hInfo[0][0], hInfoRes[0][0]); if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; double err; if(hInfo[0][0] == 0) { // check that eigenvalues are correct and in order // error is ||hD - hDRes|| / ||hD|| // using frobenius norm err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); *max_err = err > *max_err ? 
err : *max_err; // check eigenvectors if required if(evect != rocblas_evect_none) { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[0][j]; cpu_symv_hemv(rocblas_fill_upper, n, alpha, hA[0], lda, hCRes[0] + j * ldc, 1, beta, hC[0] + j * ldc, 1); } // error is ||hC - hCRes|| / ||hC|| // using frobenius norm *max_errv = norm_error('F', n, n, ldc, hCRes[0], hC[0]); } } } template void stedcj_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lgn = floor(log(n - 1) / log(2)) + 1; size_t lwork = (COMPLEX ? n * n : 0); size_t lrwork = (evect == rocblas_evect_none ? 1 : 1 + 3 * n + 4 * n * n + 2 * n * lgn); size_t liwork = (evect == rocblas_evect_none ? 
1 : 6 + 6 * n + 5 * n * lgn); std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); if(!perf) { // cpu-lapack performance (only if not in perf mode) *cpu_time_used = nan(""); } stedcj_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cold calls for(int iter = 0; iter < 2; iter++) { stedcj_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); CHECK_ROCBLAS_ERROR( rocsolver_stedcj(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { stedcj_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); start = get_time_us_sync(stream); rocsolver_stedcj(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stedcj(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); rocblas_int n = argus.get("n"); rocblas_int ldc = argus.get("ldc", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_C = ldc * n; double max_err = 0, max_errv = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; size_t size_ERes = (argus.unit_check || argus.norm_check) ? size_E : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || (evect != rocblas_evect_none && ldc < n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_stedcj(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stedcj(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hDRes(size_DRes, 1, size_DRes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hERes(size_ERes, 1, size_ERes, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCRes(size_CRes, 1, size_CRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_stedcj(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || 
argus.norm_check) stedcj_getError(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hDRes, hE, hERes, hC, hCRes, hInfo, hInfoRes, &max_err, &max_errv); // collect performance data if(argus.timing) stedcj_getPerfData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_err, n); if(evect != rocblas_evect_none) ROCSOLVER_TEST_CHECK(T, max_errv, n * n); } // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("evect", "n", "ldc"); rocsolver_bench_output(evectC, n, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, std::max(max_err, max_errv)); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, std::max(max_err, max_errv)); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEDCJ(...) extern template void testing_stedcj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEDCJ, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stedcx.cpp000066400000000000000000000032431503202240500246660ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_stedcx.hpp" #define TESTING_STEDCX(...) template void testing_stedcx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEDCX, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stedcx.hpp000066400000000000000000000520041503202240500246720ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void stedcx_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_int n, const T vl, const T vu, const rocblas_int il, const rocblas_int iu, S dD, S dE, rocblas_int* dnev, S dW, U dC, const rocblas_int ldc, rocblas_int* dinfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(nullptr, evect, erange, n, vl, vu, il, iu, dD, dE, dnev, dW, dC, ldc, dinfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, rocblas_erange(0), n, vl, vu, il, iu, dD, dE, dnev, dW, dC, ldc, dinfo), rocblas_status_invalid_value); 
EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, rocblas_evect(0), erange, n, vl, vu, il, iu, dD, dE, dnev, dW, dC, ldc, dinfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, (S) nullptr, dE, dnev, dW, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD, (S) nullptr, dnev, dW, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD, dE, (rocblas_int*)nullptr, dW, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD, dE, dnev, (S) nullptr, dC, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD, dE, dnev, dW, (U) nullptr, ldc, dinfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD, dE, dnev, dW, dC, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, 0, vl, vu, il, iu, (S) nullptr, (S) nullptr, dnev, (S) nullptr, (U) nullptr, ldc, dinfo), rocblas_status_success); } template void testing_stedcx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 2; rocblas_int ldc = 2; rocblas_erange erange = rocblas_erange_all; rocblas_evect evect = rocblas_evect_original; S vl = 0; S vu = 0; rocblas_int il = 0; rocblas_int iu = 0; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dnev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); 
CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dnev.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments stedcx_checkBadArgs(handle, evect, erange, n, vl, vu, il, iu, dD.data(), dE.data(), dnev.data(), dW.data(), dC.data(), ldc, dinfo.data()); } template void stedcx_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Sh& hD, Sh& hE, Th& hC) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add fixed splits in the matrix to test split handling // (scaling ensures that all eigenvalues are in [-20, 20]) for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 10; hE[0][i] = (hE[0][i] - 5) / 10; if(i == n / 4 || i == n / 2 || i == n - 1) hE[0][i] = 0; if(i == n / 7 || i == n / 5 || i == n / 3) hD[0][i] *= -1; } } // initialize C to the identity matrix if(evect == rocblas_evect_original) { for(rocblas_int j = 0; j < n; j++) { for(rocblas_int i = 0; i < n; i++) { if(i == j) hC[0][i + j * ldc] = 1; else hC[0][i + j * ldc] = 0; } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void stedcx_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_int n, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Sd& dD, Sd& dE, Id& dnev, Sd& dW, Td& dC, const rocblas_int ldc, Id& dinfo, Sh& hD, Sh& hE, Ih& hnev, Ih& hnevRes, Sh& hW, Sh& hWRes, Th& hC, Th& hCRes, Ih& hinfo, Ih& hinfoRes, double* max_err) { std::vector work(4 * n); std::vector iwork(3 * n); std::vector hIblock(n); std::vector hIsplit(n); rocblas_int hnsplit; S atol = 2 * get_safemin(); // input data initialization stedcx_initData(handle, evect, n, dD, dE, dC, ldc, hD, hE, hC); // execute computations // GPU lapack 
CHECK_ROCBLAS_ERROR(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD.data(), dE.data(), dnev.data(), dW.data(), dC.data(), ldc, dinfo.data())); CHECK_HIP_ERROR(hnevRes.transfer_from(dnev)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hCRes.transfer_from(dC)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); // CPU lapack cpu_stebz(erange, rocblas_eorder_entire, n, vl, vu, il, iu, atol, hD[0], hE[0], hnev[0], &hnsplit, hW[0], hIblock.data(), hIsplit.data(), work.data(), iwork.data(), hinfo[0]); // check info EXPECT_EQ(hinfo[0][0], hinfoRes[0][0]); if(hinfo[0][0] != hinfoRes[0][0]) *max_err = 1; else *max_err = 0; // if finding eigenvalues succeded, check values if(hinfoRes[0][0] == 0) { // check number of computed eigenvalues rocblas_int nn = hnevRes[0][0]; *max_err += std::abs(nn - hnev[0][0]); EXPECT_EQ(hnev[0][0], hnevRes[0][0]); // error is ||hW - hWRes|| / ||hW|| // using frobenius norm double err = norm_error('F', 1, nn, 1, hW[0], hWRes[0]); *max_err = err > *max_err ? err : *max_err; if(evect != rocblas_evect_none) { // C should be orthonormal, if it is then C^T*C should be the identity if(nn > 0) { std::vector CCres(nn * nn, 0.0); std::vector I(nn * nn, 0.0); for(rocblas_int i = 0; i < nn; i++) I[i + i * nn] = T(1); cpu_gemm(rocblas_operation_conjugate_transpose, rocblas_operation_none, nn, nn, n, T(1), hCRes[0], ldc, hCRes[0], ldc, T(0), CCres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), CCres.data()); *max_err = err > *max_err ? 
err : *max_err; } // for each of the nev eigenvalues w_j, verify that the associated eigenvector is in the // null space of (A - w_i * I) T alpha, t1, t2; for(int j = 0; j < nn; j++) { for(int i = 0; i < n; i++) { alpha = hWRes[0][j] - hD[0][i]; hC[0][i + j * ldc] = hCRes[0][i + j * ldc] * alpha; } t1 = hCRes[0][j * ldc]; hCRes[0][j * ldc] = hE[0][0] * hCRes[0][1 + j * ldc]; for(int i = 1; i < n - 1; i++) { t2 = hCRes[0][i + j * ldc]; hCRes[0][i + j * ldc] = hE[0][i - 1] * t1 + hE[0][i] * hCRes[0][(i + 1) + j * ldc]; t1 = t2; } hCRes[0][(n - 1) + j * ldc] = hE[0][n - 2] * t1; } // error is then ||hC - hCRes|| / ||hC|| // using frobenius norm err = norm_error('F', n, nn, ldc, hC[0], hCRes[0]); *max_err = err > *max_err ? err : *max_err; } } } template void stedcx_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_int n, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Sd& dD, Sd& dE, Id& dnev, Sd& dW, Td& dC, const rocblas_int ldc, Id& dinfo, Sh& hD, Sh& hE, Ih& hnev, Sh& hW, Th& hC, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { // cpu-lapack performance (only if not in perf mode) *cpu_time_used = nan(""); } stedcx_initData(handle, evect, n, dD, dE, dC, ldc, hD, hE, hC); // cold calls for(int iter = 0; iter < 2; iter++) { stedcx_initData(handle, evect, n, dD, dE, dC, ldc, hD, hE, hC); CHECK_ROCBLAS_ERROR(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD.data(), dE.data(), dnev.data(), dW.data(), dC.data(), ldc, dinfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); 
rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { stedcx_initData(handle, evect, n, dD, dE, dC, ldc, hD, hE, hC); start = get_time_us_sync(stream); rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD.data(), dE.data(), dnev.data(), dW.data(), dC.data(), ldc, dinfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stedcx(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char erangeC = argus.get("erange"); rocblas_int n = argus.get("n"); rocblas_int ldc = argus.get("ldc", n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_W = n; size_t size_C = ldc * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? size_C : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; // check invalid sizes bool invalid_size = (n < 0) || (ldc < n) || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu))) || (erange == rocblas_erange_index && (il < 1 || iu < 0)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, (S*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, (S*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hW(size_W, 1, size_W, 1); host_strided_batch_vector hWRes(size_WRes, 1, size_WRes, 1); host_strided_batch_vector hnev(1, 1, 1, 1); host_strided_batch_vector hnevRes(1, 1, 1, 1); host_strided_batch_vector hinfo(1, 1, 1, 1); host_strided_batch_vector hinfoRes(1, 1, 1, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCRes(size_CRes, 1, size_CRes, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dW(size_W, 1, size_W, 1); device_strided_batch_vector dnev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); if(size_D) 
CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dnev.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_stedcx(handle, evect, erange, n, vl, vu, il, iu, dD.data(), dE.data(), dnev.data(), dW.data(), dC.data(), ldc, dinfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) stedcx_getError(handle, evect, erange, n, vl, vu, il, iu, dD, dE, dnev, dW, dC, ldc, dinfo, hD, hE, hnev, hnevRes, hW, hWRes, hC, hCRes, hinfo, hinfoRes, &max_error); // collect performance data if(argus.timing) stedcx_getPerfData(handle, evect, erange, n, vl, vu, il, iu, dD, dE, dnev, dW, dC, ldc, dinfo, hD, hE, hnev, hW, hC, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using 3 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 3 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("evect", "erange", "n", "vl", "vu", "il", "iu", "ldc"); rocsolver_bench_output(evectC, erangeC, n, vl, vu, il, iu, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEDCX(...) 
extern template void testing_stedcx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEDCX, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stein.cpp000066400000000000000000000032441503202240500245170ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_stein.hpp" #define TESTING_STEIN(...) 
template void testing_stein<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEIN, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_stein.hpp000066400000000000000000000545261503202240500245350ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void stein_checkBadArgs(const rocblas_handle handle, const rocblas_int n, S dD, S dE, U dNev, S dW, U dIblock, U dIsplit, T dZ, const rocblas_int ldz, U dIfail, U dInfo) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_stein(nullptr, n, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, (S) nullptr, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, (S) nullptr, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, (U) nullptr, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, dNev, (S) nullptr, dIblock, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, dD, dE, dNev, dW, (U) nullptr, dIsplit, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, dD, dE, dNev, dW, dIblock, (U) nullptr, dZ, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, dNev, dW, dIblock, dIsplit, (T) nullptr, ldz, dIfail, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, (U) nullptr, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD, dE, 
dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, (U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, 0, (S) nullptr, (S) nullptr, dNev, (S) nullptr, (U) nullptr, (U) nullptr, (T) nullptr, ldz, (U) nullptr, dInfo), rocblas_status_success); } template void testing_stein_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int ldz = 1; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dIblock(1, 1, 1, 1); device_strided_batch_vector dIsplit(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dIblock.memcheck()); CHECK_HIP_ERROR(dIsplit.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments stein_checkBadArgs(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); } template void stein_initData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nev, Sd& dD, Sd& dE, Ud& dNev, Sd& dW, Ud& dIblock, Ud& dIsplit, Sh& hD, Sh& hE, Uh& hNev, Sh& hW, Uh& hIblock, Uh& hIsplit) { if(CPU) { using S = decltype(std::real(T{})); rocblas_init(hD, true); rocblas_init(hE, true); rocblas_int nsplit, info; size_t lwork = 4 * n; size_t liwork = 3 * n; std::vector work(lwork); std::vector iwork(liwork); // scale matrix for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 10; hE[0][i] -= 5; if(i == n / 4 || i == n / 2 || i == n - 1) hE[0][i] = 0; if(i == n / 7 
|| i == n / 5 || i == n / 3) hD[0][i] *= -1; } // compute a subset of the eigenvalues S il = n - nev + 1; S iu = n; S abstol = 2 * get_safemin(); cpu_stebz(rocblas_erange_index, rocblas_eorder_blocks, n, S(0), S(0), il, iu, abstol, hD[0], hE[0], hNev[0], &nsplit, hW[0], hIblock[0], hIsplit[0], work.data(), iwork.data(), &info); } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); CHECK_HIP_ERROR(dNev.transfer_from(hNev)); CHECK_HIP_ERROR(dW.transfer_from(hW)); CHECK_HIP_ERROR(dIblock.transfer_from(hIblock)); CHECK_HIP_ERROR(dIsplit.transfer_from(hIsplit)); } } template void stein_getError(const rocblas_handle handle, const rocblas_int n, const rocblas_int nev, Sd& dD, Sd& dE, Ud& dNev, Sd& dW, Ud& dIblock, Ud& dIsplit, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Sh& hD, Sh& hE, Uh& hNev, Sh& hW, Uh& hIblock, Uh& hIsplit, Th& hZ, Th& hZRes, Uh& hIfail, Uh& hIfailRes, Uh& hInfo, Uh& hInfoRes, double* max_err) { using S = decltype(std::real(T{})); size_t lwork = 5 * n; size_t liwork = n; size_t lifail = n; std::vector work(lwork); std::vector iwork(liwork); // input data initialization stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack cpu_stein(n, hD[0], hE[0], hNev[0], hW[0], hIblock[0], hIsplit[0], hZ[0], ldz, work.data(), iwork.data(), hIfail[0], hInfo[0]); // check info EXPECT_EQ(hInfo[0][0], hInfoRes[0][0]); if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; double err; if(hInfo[0][0] == 0) { rocblas_int nn = hNev[0][0]; // check ifail err = 0; for(int j = 0; j < nn; 
j++) { EXPECT_EQ(hIfailRes[0][j], 0) << "j = " << j; if(hIfailRes[0][j] != 0) err++; } *max_err = err > *max_err ? err : *max_err; // need to implicitly test eigenvectors due to non-uniqueness of eigenvectors under scaling // Z should be orthonormal, if it is then Z^T*Z should be the identity if(nn > 0) { std::vector ZZres(nn * nn, 0.0); std::vector I(nn * nn, 0.0); for(rocblas_int i = 0; i < nn; i++) I[i + i * nn] = T(1); cpu_gemm(rocblas_operation_conjugate_transpose, rocblas_operation_none, nn, nn, n, T(1), hZRes[0], ldz, hZRes[0], ldz, T(0), ZZres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), ZZres.data()); *max_err = err > *max_err ? err : *max_err; } // for each of the nev eigenvalues w_j, verify that the associated eigenvector is in the // null space of (A - w_i * I) T alpha, t1, t2; for(int j = 0; j < nn; j++) { for(int i = 0; i < n; i++) { alpha = hW[0][j] - hD[0][i]; hZ[0][i + j * ldz] = hZRes[0][i + j * ldz] * alpha; } t1 = hZRes[0][j * ldz]; hZRes[0][j * ldz] = hE[0][0] * hZRes[0][1 + j * ldz]; for(int i = 1; i < n - 1; i++) { t2 = hZRes[0][i + j * ldz]; hZRes[0][i + j * ldz] = hE[0][i - 1] * t1 + hE[0][i] * hZRes[0][(i + 1) + j * ldz]; t1 = t2; } hZRes[0][(n - 1) + j * ldz] = hE[0][n - 2] * t1; } // error is then ||hZ - hZRes|| / ||hZ|| // using frobenius norm err = norm_error('F', n, nn, ldz, hZ[0], hZRes[0]); *max_err = err > *max_err ? err : *max_err; } else { // check ifail err = 0; for(int j = 0; j < hInfo[0][0]; j++) { EXPECT_NE(hIfailRes[0][j], 0) << "j = " << j; if(hIfailRes[0][j] == 0) err++; } *max_err = err > *max_err ? 
err : *max_err; } } template void stein_getPerfData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nev, Sd& dD, Sd& dE, Ud& dNev, Sd& dW, Ud& dIblock, Ud& dIsplit, Td& dZ, const rocblas_int ldz, Ud& dIfail, Ud& dInfo, Sh& hD, Sh& hE, Uh& hNev, Sh& hW, Uh& hIblock, Uh& hIsplit, Th& hZ, Uh& hIfail, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { using S = decltype(std::real(T{})); size_t lwork = 5 * n; size_t liwork = n; size_t lifail = n; std::vector work(lwork); std::vector iwork(liwork); if(!perf) { stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_stein(n, hD[0], hE[0], hNev[0], hW[0], hIblock[0], hIsplit[0], hZ[0], ldz, work.data(), iwork.data(), hIfail[0], hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); // cold calls for(int iter = 0; iter < 2; iter++) { stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); CHECK_ROCBLAS_ERROR(rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { stein_initData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, hD, hE, hNev, hW, hIblock, hIsplit); start = 
get_time_us_sync(stream); rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_stein(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int nev = argus.get("nev", n < 5 ? n : 5); rocblas_int ldz = argus.get("ldz", n); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = size_D; size_t size_W = size_D; size_t size_iblock = size_D; size_t size_isplit = size_D; size_t size_Z = ldz * n; size_t size_ifail = size_D; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ZRes = (argus.unit_check || argus.norm_check) ? size_Z : 0; size_t size_ifailRes = (argus.unit_check || argus.norm_check) ? size_ifail : 0; // check invalid sizes bool invalid_size = (n < 0 || ldz < n); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_stein(handle, n, (S*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_stein(handle, n, (S*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (S*)nullptr, (rocblas_int*)nullptr, (rocblas_int*)nullptr, (T*)nullptr, ldz, (rocblas_int*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations // host host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hNev(1, 1, 1, 1); host_strided_batch_vector hW(size_W, 1, size_W, 1); host_strided_batch_vector hIblock(size_iblock, 1, size_iblock, 1); host_strided_batch_vector hIsplit(size_isplit, 1, size_isplit, 1); host_strided_batch_vector hZ(size_Z, 1, size_Z, 1); host_strided_batch_vector hZRes(size_ZRes, 1, size_ZRes, 1); host_strided_batch_vector hIfail(size_ifail, 1, size_ifail, 1); host_strided_batch_vector hIfailRes(size_ifailRes, 1, size_ifailRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); // device device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dW(size_W, 1, size_W, 1); device_strided_batch_vector dIblock(size_iblock, 1, size_iblock, 1); device_strided_batch_vector dIsplit(size_isplit, 1, size_isplit, 1); device_strided_batch_vector dZ(size_Z, 1, size_Z, 1); device_strided_batch_vector dIfail(size_ifail, 1, size_ifail, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(size_iblock) CHECK_HIP_ERROR(dIblock.memcheck()); if(size_isplit) CHECK_HIP_ERROR(dIsplit.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); if(size_ifail) CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_stein(handle, n, dD.data(), dE.data(), dNev.data(), dW.data(), dIblock.data(), dIsplit.data(), dZ.data(), ldz, dIfail.data(), dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; 
} // check computations if(argus.unit_check || argus.norm_check) stein_getError(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo, hD, hE, hNev, hW, hIblock, hIsplit, hZ, hZRes, hIfail, hIfailRes, hInfo, hInfoRes, &max_error); // collect performance data if(argus.timing) stein_getPerfData(handle, n, nev, dD, dE, dNev, dW, dIblock, dIsplit, dZ, ldz, dIfail, dInfo, hD, hE, hNev, hW, hIblock, hIsplit, hZ, hIfail, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n", "nev", "ldz"); rocsolver_bench_output(n, nev, ldz); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEIN(...) extern template void testing_stein<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEIN, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_steqr.cpp000066400000000000000000000032441503202240500245330ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_steqr.hpp" #define TESTING_STEQR(...) template void testing_steqr<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STEQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_steqr.hpp000066400000000000000000000411371503202240500245430ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void steqr_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, S dD, S dE, T dC, const rocblas_int ldc, U dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_steqr(nullptr, evect, n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, rocblas_evect(0), n, dD, dE, dC, ldc, dInfo), rocblas_status_invalid_value); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, (S) nullptr, dE, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, dD, (S) nullptr, dC, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, dD, dE, (T) nullptr, ldc, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, dD, dE, dC, ldc, (U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_steqr(handle, evect, 0, (S) nullptr, (S) nullptr, (T) nullptr, ldc, dInfo), rocblas_status_success); } template void testing_steqr_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_int n = 2; rocblas_int ldc = 2; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); 
CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments steqr_checkBadArgs(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); } template void steqr_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo) { if(CPU) { using S = decltype(std::real(T{})); rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add random splits for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 400; hE[0][i] -= 5; } // add fixed splits in the matrix to test split handling rocblas_int k = n / 2; hE[0][k] = 0; hE[0][k - 1] = 0; // initialize C to the identity matrix if(evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hC[0][i + j * ldc] = 1; else hC[0][i + j * ldc] = 0; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void steqr_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hDRes, Sh& hE, Sh& hERes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err) { using S = decltype(std::real(T{})); size_t lwork = (evect == rocblas_evect_none ? 
0 : 2 * n - 2); std::vector work(lwork); // input data initialization steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hCRes.transfer_from(dC)); // if eigenvectors were required, prepare matrix A (upper triangular) for implicit tests rocblas_int lda = n; size_t size_A = lda * n; host_strided_batch_vector hA(size_A, 1, size_A, 1); if(evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[0][i + j * lda] = hD[0][i]; else if(i + 1 == j) hA[0][i + j * lda] = hE[0][i]; else hA[0][i + j * lda] = 0; } } } // CPU lapack cpu_steqr(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), hInfo[0]); // check info EXPECT_EQ(hInfo[0][0], hInfoRes[0][0]); if(hInfo[0][0] != hInfoRes[0][0]) *max_err = 1; else *max_err = 0; double err; if(hInfo[0][0] == 0) { // check that eigenvalues are correct and in order // error is ||hD - hDRes|| / ||hD|| // using frobenius norm err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); *max_err = err > *max_err ? err : *max_err; // check eigenvectors if required if(evect != rocblas_evect_none) { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[0][j]; cpu_symv_hemv(rocblas_fill_upper, n, alpha, hA[0], lda, hCRes[0] + j * ldc, 1, beta, hC[0] + j * ldc, 1); } // error is ||hC - hCRes|| / ||hC|| // using frobenius norm err = norm_error('F', n, n, ldc, hC[0], hCRes[0]); *max_err = err > *max_err ? 
err : *max_err; } } } template void steqr_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Sd& dD, Sd& dE, Td& dC, const rocblas_int ldc, Ud& dInfo, Sh& hD, Sh& hE, Th& hC, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { using S = decltype(std::real(T{})); size_t lwork = (evect == rocblas_evect_none ? 0 : 2 * n - 2); std::vector work(lwork); if(!perf) { steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_steqr(evect, n, hD[0], hE[0], hC[0], ldc, work.data(), hInfo[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); // cold calls for(int iter = 0; iter < 2; iter++) { steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); CHECK_ROCBLAS_ERROR( rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { steqr_initData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo); start = get_time_us_sync(stream); rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_steqr(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); rocblas_int n = 
argus.get("n"); rocblas_int ldc = argus.get("ldc", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_int hot_calls = argus.iters; // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; size_t size_C = ldc * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; size_t size_ERes = (argus.unit_check || argus.norm_check) ? size_E : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || (evect != rocblas_evect_none && ldc < n)); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_steqr(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_steqr(handle, evect, n, (S*)nullptr, (S*)nullptr, (T*)nullptr, ldc, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hDRes(size_DRes, 1, size_DRes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hERes(size_ERes, 1, size_ERes, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hCRes(size_CRes, 1, size_CRes, 1); host_strided_batch_vector hInfo(1, 1, 1, 1); host_strided_batch_vector hInfoRes(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dC(size_C, 1, 
size_C, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_steqr(handle, evect, n, dD.data(), dE.data(), dC.data(), ldc, dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) steqr_getError(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hDRes, hE, hERes, hC, hCRes, hInfo, hInfoRes, &max_error); // collect performance data if(argus.timing) steqr_getPerfData(handle, evect, n, dD, dE, dC, ldc, dInfo, hD, hE, hC, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("evect", "n", "ldc"); rocsolver_bench_output(evectC, n, ldc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STEQR(...) 
extern template void testing_steqr<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STEQR, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_sterf.cpp000066400000000000000000000032421503202240500245160ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sterf.hpp" #define TESTING_STERF(...) 
template void testing_sterf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_STERF, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/auxiliary/testing_sterf.hpp000066400000000000000000000272401503202240500245270ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sterf_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dD, T dE, U dInfo) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sterf(nullptr, n, dD, dE, dInfo), rocblas_status_invalid_handle); // values // N/A // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, (T) nullptr, dE, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, dD, (T) nullptr, dInfo), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, dD, dE, (U) nullptr), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, 0, (T) nullptr, (T) nullptr, dInfo), rocblas_status_success); } template void testing_sterf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 2; // memory allocations device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sterf_checkBadArgs(handle, n, dD.data(), dE.data(), dInfo.data()); } template void sterf_initData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Ud& dInfo, Th& hD, Th& hE, Uh& hInfo) { if(CPU) { rocblas_init(hD, true); rocblas_init(hE, true); // scale matrix and add random splits for(rocblas_int i = 0; i < n; i++) { hD[0][i] += 400; hE[0][i] -= 5; } // add fixed splits in the matrix to test split handling rocblas_int k = n / 2; hE[0][k] = 0; hE[0][k - 1] = 0; } if(GPU) { // now copy to the GPU 
CHECK_HIP_ERROR(dD.transfer_from(hD)); CHECK_HIP_ERROR(dE.transfer_from(hE)); } } template void sterf_getError(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Ud& dInfo, Th& hD, Th& hDRes, Th& hE, Th& hERes, Uh& hInfo, double* max_err) { // input data initialization sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data())); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hERes.transfer_from(dE)); // CPU lapack cpu_sterf(n, hD[0], hE[0]); // error is ||hD - hDRes|| / ||hD|| // using frobenius norm *max_err = norm_error('F', 1, n, 1, hD[0], hDRes[0]); } template void sterf_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dD, Td& dE, Ud& dInfo, Th& hD, Th& hE, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); cpu_sterf(n, hD[0], hE[0]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); // cold calls for(int iter = 0; iter < 2; iter++) { sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); CHECK_ROCBLAS_ERROR(rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data())); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sterf_initData(handle, n, dD, dE, dInfo, hD, hE, hInfo); start = get_time_us_sync(stream); 
rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data()); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sterf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int hot_calls = argus.iters; if(argus.alg_mode == 1) { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid), rocblas_status_success); rocsolver_alg_mode alg_mode; EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode), rocblas_status_success); EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); } // check non-supported values // N/A // determine sizes size_t size_D = n; size_t size_E = n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; size_t size_ERes = (argus.unit_check || argus.norm_check) ? size_E : 0; // check invalid sizes bool invalid_size = (n < 0); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_sterf(handle, n, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_sterf(handle, n, (T*)nullptr, (T*)nullptr, (rocblas_int*)nullptr)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hD(size_D, 1, size_D, 1); host_strided_batch_vector hDRes(size_DRes, 1, size_DRes, 1); host_strided_batch_vector hE(size_E, 1, size_E, 1); host_strided_batch_vector hERes(size_ERes, 1, size_ERes, 1); 
host_strided_batch_vector hInfo(1, 1, 1, 1); device_strided_batch_vector dD(size_D, 1, size_D, 1); device_strided_batch_vector dE(size_E, 1, size_E, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sterf(handle, n, dD.data(), dE.data(), dInfo.data()), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sterf_getError(handle, n, dD, dE, dInfo, hD, hDRes, hE, hERes, hInfo, &max_error); // collect performance data if(argus.timing) sterf_getPerfData(handle, n, dD, dE, dInfo, hD, hE, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("n"); rocsolver_bench_output(n); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_STERF(...) 
extern template void testing_sterf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_STERF, FOREACH_REAL_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/containers/000077500000000000000000000000001503202240500212675ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/clients/common/containers/d_vector.hpp000066400000000000000000000060021503202240500236030ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include #include #include #include #include #include "common/misc/data_initializer.hpp" #include "common/misc/rocblas_test.hpp" #include "common_host_helpers.hpp" /* ============================================================================================ */ /*! \brief base-class to allocate/deallocate device memory */ template class d_vector { private: size_t size, bytes; public: inline size_t nmemb() const noexcept { return size; } d_vector(size_t s) : size(s) , bytes(s ? s * sizeof(T) : sizeof(T)) { } T* device_vector_setup() { T* d = nullptr; if((hipMalloc)(&d, bytes) != hipSuccess) { fmt::print(stderr, "Error allocating {} bytes ({} GB)\n", bytes, bytes >> 30); d = nullptr; } if(d != nullptr) { auto status = (hipMemset)(d, 0, bytes); if(status != hipSuccess) { fmt::print(stderr, "error: {} ({}) at {}:{}\n", hipGetErrorString(status), static_cast(status), __FILE__, __LINE__); rocblas_abort(); } } return d; } void device_vector_check(T* d) {} void device_vector_teardown(T* d) { if(d != nullptr) { // Free device memory CHECK_HIP_ERROR((hipFree)(d)); } } }; rocSOLVER-rocm-6.4.3/clients/common/containers/device_batch_vector.hpp000066400000000000000000000206651503202240500257730ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "d_vector.hpp" // // Local declaration of the host strided batch vector. // template class host_batch_vector; //! //! @brief pseudo-vector subclass which uses a batch of device memory pointers //! and //! - an array of pointers in host memory //! - an array of pointers in device memory //! template class device_batch_vector : private d_vector { public: using value_type = T; public: //! //! @brief Disallow copying. //! device_batch_vector(const device_batch_vector&) = delete; //! //! @brief Disallow assigning. //! device_batch_vector& operator=(const device_batch_vector&) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param batch_count The batch count. //! explicit device_batch_vector(int64_t n, int64_t inc, int64_t batch_count) : d_vector(size_t(n) * std::abs(inc)) , m_n(n) , m_inc(inc) , m_batch_count(batch_count) { if(false == this->try_initialize_memory()) { this->free_memory(); } } //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param stride (UNUSED) The stride. //! @param batch_count The batch count. 
//! explicit device_batch_vector(int64_t n, int64_t inc, rocblas_stride stride, int64_t batch_count) : device_batch_vector(n, inc, batch_count) { } //! //! @brief Constructor (kept for backward compatibility only, to be removed). //! @param batch_count The number of vectors. //! @param size_vector The size of each vectors. //! explicit device_batch_vector(int64_t batch_count, size_t size_vector) : device_batch_vector(size_vector, 1, batch_count) { } //! //! @brief Destructor. //! ~device_batch_vector() { this->free_memory(); } //! //! @brief Returns the length of the vector. //! int64_t n() const { return this->m_n; } //! //! @brief Returns the increment of the vector. //! int64_t inc() const { return this->m_inc; } //! //! @brief Returns the value of batch_count. //! int64_t batch_count() const { return this->m_batch_count; } //! //! @brief Returns the stride value. //! rocblas_stride stride() const { return 0; } //! //! @brief Access to device data. //! @return Pointer to the device data. //! T** ptr_on_device() { return this->m_device_data; } //! //! @brief Const access to device data. //! @return Const pointer to the device data. //! const T* const* ptr_on_device() const { return this->m_device_data; } T* const* data() { return this->m_device_data; } const T* const* data() const { return this->m_device_data; } //! //! @brief Random access. //! @param batch_index The batch index. //! @return Pointer to the array on device. //! T* operator[](int64_t batch_index) { return this->m_data[batch_index]; } //! //! @brief Constant random access. //! @param batch_index The batch index. //! @return Constant pointer to the array on device. //! const T* operator[](int64_t batch_index) const { return this->m_data[batch_index]; } //! //! @brief Const cast of the data on host. //! operator const T* const *() const { return this->m_data; } // clang-format off //! //! @brief Cast of the data on host. //! operator T**() { return this->m_data; } // clang-format on //! //! 
@brief Tell whether ressources allocation failed. //! explicit operator bool() const { return nullptr != this->m_data; } //! //! @brief Copy from a host batched vector. //! @param that The host_batch_vector to copy. //! hipError_t transfer_from(const host_batch_vector& that) { hipError_t hip_err; // // Copy each vector. // for(int64_t batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(hipSuccess != (hip_err = hipMemcpy((*this)[batch_index], that[batch_index], sizeof(T) * this->nmemb(), hipMemcpyHostToDevice))) { return hip_err; } } return hipSuccess; } //! //! @brief Check if memory exists. //! @return hipSuccess if memory exists, hipErrorOutOfMemory otherwise. //! hipError_t memcheck() const { if(*this) return hipSuccess; else return hipErrorOutOfMemory; } private: int64_t m_n{}; int64_t m_inc{}; int64_t m_batch_count{}; T** m_data{}; T** m_device_data{}; //! //! @brief Try to allocate the ressources. //! @return true if success false otherwise. //! bool try_initialize_memory() { bool success = false; success = (hipSuccess == (hipMalloc)(&this->m_device_data, this->m_batch_count * sizeof(T*))); if(success) { success = (nullptr != (this->m_data = (T**)calloc(this->m_batch_count, sizeof(T*)))); if(success) { for(int64_t batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { success = (nullptr != (this->m_data[batch_index] = this->device_vector_setup())); if(!success) { break; } } if(success) { success = (hipSuccess == hipMemcpy(this->m_device_data, this->m_data, sizeof(T*) * this->m_batch_count, hipMemcpyHostToDevice)); } } } return success; } //! //! @brief Free the ressources, as much as we can. //! 
void free_memory() { if(nullptr != this->m_data) { for(int64_t batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(nullptr != this->m_data[batch_index]) { this->device_vector_teardown(this->m_data[batch_index]); this->m_data[batch_index] = nullptr; } } free(this->m_data); this->m_data = nullptr; } if(nullptr != this->m_device_data) { auto tmp_device_data = this->m_device_data; this->m_device_data = nullptr; CHECK_HIP_ERROR((hipFree)(tmp_device_data)); } } }; rocSOLVER-rocm-6.4.3/clients/common/containers/device_strided_batch_vector.hpp000066400000000000000000000157431503202240500275120ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once // // Local declaration of the host strided batch vector. // template class host_strided_batch_vector; //! //! @brief Implementation of a strided batched vector on device. //! template class device_strided_batch_vector : public d_vector { public: using value_type = T; public: //! //! @brief The storage type to use. //! typedef enum class estorage { block, interleave, } storage; //! //! @brief Disallow copying. //! device_strided_batch_vector(const device_strided_batch_vector&) = delete; //! //! @brief Disallow assigning. //! device_strided_batch_vector& operator=(const device_strided_batch_vector&) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param stride The stride. //! @param batch_count The batch count. //! @param stg The storage format to use. //! 
explicit device_strided_batch_vector(int64_t n, int64_t inc, rocblas_stride stride, int64_t batch_count, storage stg = storage::block) : d_vector(calculate_nmemb(n, inc, stride, batch_count, stg)) , m_storage(stg) , m_n(n) , m_inc(inc) , m_stride(stride) , m_batch_count(batch_count) { bool valid_parameters = true; switch(this->m_storage) { case storage::block: { if(std::abs(this->m_stride) < this->m_n * std::abs(this->m_inc)) { valid_parameters = false; } break; } case storage::interleave: { if(std::abs(this->m_inc) < std::abs(this->m_stride) * this->m_batch_count) { valid_parameters = false; } break; } } if(valid_parameters) { this->m_data = this->device_vector_setup(); } } //! //! @brief Destructor. //! ~device_strided_batch_vector() { if(nullptr != this->m_data) { this->device_vector_teardown(this->m_data); this->m_data = nullptr; } } //! //! @brief Returns the data pointer. //! T* data() { return this->m_data; } //! //! @brief Returns the data pointer. //! const T* data() const { return this->m_data; } //! //! @brief Returns the length. //! int64_t n() const { return this->m_n; } //! //! @brief Returns the increment. //! int64_t inc() const { return this->m_inc; } //! //! @brief Returns the batch count. //! int64_t batch_count() const { return this->m_batch_count; } //! //! @brief Returns the stride value. //! rocblas_stride stride() const { return this->m_stride; } //! //! @brief Random access. //! @param batch_index The batch index. //! @return Pointer to the array on device. //! T* operator[](int64_t batch_index) { return (this->m_stride >= 0) ? this->m_data + batch_index * this->m_stride : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride; } //! //! @brief Constant random access. //! @param batch_index The batch index. //! @return Constant pointer to the array on device. //! const T* operator[](int64_t batch_index) const { return (this->m_stride >= 0) ? 
this->m_data + batch_index * this->m_stride : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride; } //! //! @brief Cast operator. //! @remark Returns the pointer of the first vector. //! operator T*() { return (*this)[0]; } //! //! @brief Non-mutable cast operator. //! @remark Returns the non-mutable pointer of the first vector. //! operator const T*() const { return (*this)[0]; } //! //! @brief Tell whether ressources allocation failed. //! explicit operator bool() const { return nullptr != this->m_data; } //! //! @brief Transfer data from a strided batched vector on device. //! @param that That strided batched vector on device. //! @return The hip error. //! hipError_t transfer_from(const host_strided_batch_vector& that) { return hipMemcpy(this->data(), that.data(), sizeof(T) * this->nmemb(), hipMemcpyHostToDevice); } //! //! @brief Check if memory exists. //! @return hipSuccess if memory exists, hipErrorOutOfMemory otherwise. //! hipError_t memcheck() const { if(*this) return hipSuccess; else return hipErrorOutOfMemory; } private: storage m_storage{storage::block}; int64_t m_n{}; int64_t m_inc{}; rocblas_stride m_stride{}; int64_t m_batch_count{}; T* m_data{}; static size_t calculate_nmemb(int64_t n, int64_t inc, rocblas_stride stride, int64_t batch_count, storage st) { switch(st) { case storage::block: return size_t(std::abs(stride)) * batch_count; case storage::interleave: return size_t(n) * std::abs(inc); } return 0; } }; rocSOLVER-rocm-6.4.3/clients/common/containers/host_batch_vector.hpp000066400000000000000000000175151503202240500255110ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include #include #include "common/misc/data_initializer.hpp" // // Local declaration of the device batch vector. // template class device_batch_vector; //! //! @brief Implementation of the batch vector on host. //! template class host_batch_vector { public: using value_type = T; public: //! //! @brief Delete copy constructor. //! host_batch_vector(const host_batch_vector& that) = delete; //! //! @brief Delete copy assignement. //! host_batch_vector& operator=(const host_batch_vector& that) = delete; //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param batch_count The batch count. //! 
explicit host_batch_vector(int64_t n, int64_t inc, int64_t batch_count) : m_n(n) , m_inc(inc) , m_batch_count(batch_count) { if(false == this->try_initialize_memory()) { this->free_memory(); } } //! //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. //! @param stride (UNUSED) The stride. //! @param batch_count The batch count. //! explicit host_batch_vector(int64_t n, int64_t inc, rocblas_stride stride, int64_t batch_count) : host_batch_vector(n, inc, batch_count) { } //! //! @brief Destructor. //! ~host_batch_vector() { this->free_memory(); } //! //! @brief Returns the length of the vector. //! int64_t n() const { return this->m_n; } //! //! @brief Returns the increment of the vector. //! int64_t inc() const { return this->m_inc; } //! //! @brief Returns the batch count. //! int64_t batch_count() const { return this->m_batch_count; } //! //! @brief Returns the stride value. //! rocblas_stride stride() const { return 0; } //! //! @brief Random access. //! @param batch_index The batch index. //! @return Pointer to the array on host. //! T* operator[](int64_t batch_index) { return this->m_data[batch_index]; } //! //! @brief Constant random access. //! @param batch_index The batch index. //! @return Constant pointer to the array on host. //! const T* operator[](int64_t batch_index) const { return this->m_data[batch_index]; } // clang-format off //! //! @brief Cast to a double pointer. //! operator T**() { return this->m_data; } // clang-format on //! //! @brief Constant cast to a double pointer. //! operator const T* const *() { return this->m_data; } //! //! @brief Copy from a host batched vector. //! @param that the vector the data is copied from. //! @return true if the copy is done successfully, false otherwise. //! 
bool copy_from(const host_batch_vector& that) { if((this->batch_count() == that.batch_count()) && (this->n() == that.n()) && (this->inc() == that.inc())) { size_t num_bytes = this->n() * std::abs(this->inc()) * sizeof(T); for(int64_t batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { memcpy((*this)[batch_index], that[batch_index], num_bytes); } return true; } else { return false; } } //! //! @brief Transfer from a device batched vector. //! @param that the vector the data is copied from. //! @return the hip error. //! hipError_t transfer_from(const device_batch_vector& that) { hipError_t hip_err; size_t num_bytes = size_t(this->m_n) * std::abs(this->m_inc) * sizeof(T); for(int64_t batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(hipSuccess != (hip_err = hipMemcpy((*this)[batch_index], that[batch_index], num_bytes, hipMemcpyDeviceToHost))) { return hip_err; } } return hipSuccess; } //! //! @brief Check if memory exists. //! @return hipSuccess if memory exists, hipErrorOutOfMemory otherwise. //! hipError_t memcheck() const { return (nullptr != this->m_data) ? hipSuccess : hipErrorOutOfMemory; } private: int64_t m_n{}; int64_t m_inc{}; int64_t m_batch_count{}; T** m_data{}; bool try_initialize_memory() { bool success = (nullptr != (this->m_data = (T**)calloc(this->m_batch_count, sizeof(T*)))); if(success) { size_t nmemb = size_t(this->m_n) * std::abs(this->m_inc); for(int64_t batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { success = (nullptr != (this->m_data[batch_index] = (T*)calloc(nmemb, sizeof(T)))); if(false == success) { break; } } } return success; } void free_memory() { if(nullptr != this->m_data) { for(int64_t batch_index = 0; batch_index < this->m_batch_count; ++batch_index) { if(nullptr != this->m_data[batch_index]) { free(this->m_data[batch_index]); this->m_data[batch_index] = nullptr; } } free(this->m_data); this->m_data = nullptr; } } }; //! //! @brief Overload output operator. //! 
@param os The ostream. //! @param that That host batch vector. //! template std::ostream& operator<<(std::ostream& os, const host_batch_vector& that) { auto n = that.n(); auto inc = std::abs(that.inc()); auto batch_count = that.batch_count(); for(int64_t batch_index = 0; batch_index < batch_count; ++batch_index) { auto batch_data = that[batch_index]; os << "[" << batch_index << "] = { " << batch_data[0]; for(int64_t i = 1; i < n; ++i) { os << ", " << batch_data[i * inc]; } os << " }" << std::endl; } return os; } rocSOLVER-rocm-6.4.3/clients/common/containers/host_strided_batch_vector.hpp000066400000000000000000000211151503202240500272160ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * *************************************************************************/

#pragma once

// NOTE(review): the header name after #include and the parameter lists after each `template`
// keyword below appear to have been lost in extraction -- restore them from the upstream file.
#include

//
// Local declaration of the device strided batch vector.
//
template class device_strided_batch_vector;

//!
//! @brief Implementation of a host strided batched vector.
//!        All batch instances live in one contiguous, zero-initialized array of size()
//!        elements; copying and assignment are disabled.
//!
template class host_strided_batch_vector
{
public:
    using value_type = T;

public:
    //!
    //! @brief The storage type to use.
    //!        `block` sizes the array as |stride| * batch_count, `interleave` as n * |inc|
    //!        (see calculate_nmemb).
    //!
    typedef enum class estorage
    {
        block,
        interleave
    } storage;

    //!
    //! @brief Disallow copying.
    //!
    host_strided_batch_vector(const host_strided_batch_vector&) = delete;

    //!
    //! @brief Disallow assigning.
    //!
    host_strided_batch_vector& operator=(const host_strided_batch_vector&) = delete;

    //!
    //! @brief Constructor.
    //! @param n The length of the vector.
    //! @param inc The increment.
    //! @param stride The stride.
    //! @param batch_count The batch count.
    //! @param stg The storage format to use.
    //! @remark No memory is allocated (data() stays null) when the element count is zero or
    //!         when the parameters are inconsistent for the chosen storage format.
    //!
    explicit host_strided_batch_vector(int64_t n,
                                       int64_t inc,
                                       rocblas_stride stride,
                                       int64_t batch_count,
                                       storage stg = storage::block)
        : m_storage(stg)
        , m_n(n)
        , m_inc(inc)
        , m_stride(stride)
        , m_batch_count(batch_count)
        , m_nmemb(calculate_nmemb(n, inc, stride, batch_count, stg))
    {
        bool valid_parameters = this->m_nmemb > 0;
        if(valid_parameters)
        {
            switch(this->m_storage)
            {
            case storage::block:
            {
                // each batch instance (n * |inc| elements) must fit within one stride
                if(std::abs(this->m_stride) < this->m_n * std::abs(this->m_inc))
                {
                    valid_parameters = false;
                }
                break;
            }
            case storage::interleave:
            {
                // all batch instances (|stride| * batch_count) must fit within one increment
                if(std::abs(this->m_inc) < std::abs(this->m_stride) * this->m_batch_count)
                {
                    valid_parameters = false;
                }
                break;
            }
            }

            if(valid_parameters)
            {
                // Value-initialization (`new T{}` or `new T[]{}`) of a non-class type yields zero-initialization
                this->m_data = new T[this->m_nmemb]{};
            }
        }
    }

    //!
    //! @brief Destructor.
    //!
    ~host_strided_batch_vector()
    {
        if(nullptr != this->m_data)
        {
            delete[] this->m_data;
            this->m_data = nullptr;
        }
    }

    //!
    //! @brief Returns the data pointer.
    //!
    T* data()
    {
        return this->m_data;
    }

    //!
    //! @brief Returns the data pointer.
    //!
    const T* data() const
    {
        return this->m_data;
    }

    //!
    //! @brief Returns the length.
    //!
    int64_t n() const
    {
        return this->m_n;
    }

    //!
    //! @brief Returns the increment.
    //!
    int64_t inc() const
    {
        return this->m_inc;
    }

    //!
    //! @brief Returns the batch count.
    //!
    int64_t batch_count() const
    {
        return this->m_batch_count;
    }

    //!
    //! @brief Returns the stride.
    //!
    rocblas_stride stride() const
    {
        return this->m_stride;
    }

    //!
    //! @brief Random access.
    //! @param batch_index The batch index.
    //! @return Pointer to the array on host.
    //! @remark With a negative stride the offset is (batch_index + 1 - batch_count) * stride,
    //!         which is non-negative for indices in [0, batch_count).
    //!
    T* operator[](int64_t batch_index)
    {
        return (this->m_stride >= 0)
                   ? this->m_data + this->m_stride * batch_index
                   : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride;
    }

    //!
    //! @brief Constant random access.
    //! @param batch_index The batch index.
    //! @return Constant pointer to the array on host.
    //!
    const T* operator[](int64_t batch_index) const
    {
        return (this->m_stride >= 0)
                   ? this->m_data + this->m_stride * batch_index
                   : this->m_data + (batch_index + 1 - this->m_batch_count) * this->m_stride;
    }

    //!
    //! @brief Cast operator.
    //! @remark Returns the pointer of the first vector.
    //!
    operator T*()
    {
        return (*this)[0];
    }

    //!
    //! @brief Non-mutable cast operator.
    //! @remark Returns the non-mutable pointer of the first vector.
    //!
    operator const T*() const
    {
        return (*this)[0];
    }

    //!
    //! @brief Tell whether resource allocation succeeded (true when the data pointer is non-null).
    //!
    explicit operator bool() const
    {
        return nullptr != this->m_data;
    }

    //!
    //! @brief Copy data from a strided batched vector on host.
    //! @param that That strided batched vector on host.
    //! @return true if successful, false otherwise.
    //! @remark Length, increment, stride and batch count must all match; the storage format
    //!         is not compared.
    //!
    bool copy_from(const host_strided_batch_vector& that)
    {
        if(that.n() == this->m_n && that.inc() == this->m_inc && that.stride() == this->m_stride
           && that.batch_count() == this->m_batch_count)
        {
            memcpy(this->data(), that.data(), sizeof(T) * this->m_nmemb);
            return true;
        }
        else
        {
            return false;
        }
    }

    //!
    //! @brief Transfer data from a strided batched vector on device.
    //! @param that That strided batched vector on device.
    //! @return The hip error.
    //! @remark Copies size() elements without comparing shapes.
    //!
    // NOTE(review): the member-template parameter list and the argument list of
    // device_strided_batch_vector are missing here -- confirm against upstream.
    template hipError_t transfer_from(const device_strided_batch_vector& that)
    {
        return hipMemcpy(this->m_data, that.data(), sizeof(T) * this->m_nmemb,
                         hipMemcpyDeviceToHost);
    }

    //!
    //! @brief Check if memory exists.
    //! @return hipSuccess if memory exists, hipErrorOutOfMemory otherwise.
    //!
    hipError_t memcheck() const
    {
        return ((bool)*this) ? hipSuccess : hipErrorOutOfMemory;
    }

    //!
    //! @brief Get size of vector
    //! @return number of elements
    //!
    size_t size() const
    {
        return this->m_nmemb;
    }

private:
    storage m_storage{storage::block};
    int64_t m_n{};
    int64_t m_inc{};
    rocblas_stride m_stride{};
    int64_t m_batch_count{};
    size_t m_nmemb{};
    T* m_data{};

    // Number of elements to allocate: |stride| * batch_count for block storage,
    // n * |inc| for interleaved storage.
    static size_t calculate_nmemb(int64_t n,
                                  int64_t inc,
                                  rocblas_stride stride,
                                  int64_t batch_count,
                                  storage st)
    {
        switch(st)
        {
        case storage::block: return size_t(std::abs(stride)) * batch_count;
        case storage::interleave: return size_t(n) * std::abs(inc);
        }
        return 0;
    }
};

//!
//! @brief Overload output operator.
//! @param os The ostream.
//! @param that That host strided batch vector.
//!
// NOTE(review): batch_data[0] is printed even when n() is 0 -- confirm callers never stream
// an empty vector, or guard the first element.
template std::ostream& operator<<(std::ostream& os, const host_strided_batch_vector& that)
{
    auto n = that.n();
    auto inc = std::abs(that.inc());
    auto batch_count = that.batch_count();

    for(int64_t batch_index = 0; batch_index < batch_count; ++batch_index)
    {
        auto batch_data = that[batch_index];
        os << "[" << batch_index << "] = { " << batch_data[0];
        for(int64_t i = 1; i < n; ++i)
        {
            os << ", " << batch_data[i * inc];
        }
        os << " }" << std::endl;
    }

    return os;
}
rocSOLVER-rocm-6.4.3/clients/common/containers/rocblas_vector.hpp000066400000000000000000000113531503202240500250120ustar00rootroot00000000000000
/* **************************************************************************
 * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "device_batch_vector.hpp" #include "device_strided_batch_vector.hpp" #include "host_batch_vector.hpp" #include "host_strided_batch_vector.hpp" //! //! @brief Random number with type deductions. //! template void random_generator(T& n) { n = random_generator(); } //! //! //! template void random_nan_generator(T& n) { n = T(rocblas_nan_rng()); } //! //! @brief Template for initializing a host //! (non_batched|batched|strided_batched)vector. //! @param that That vector. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init_template(U& that, bool seedReset = false) { if(seedReset) { rocblas_seedrand(); } for(int64_t batch_index = 0; batch_index < that.batch_count(); ++batch_index) { auto batched_data = that[batch_index]; auto inc = std::abs(that.inc()); auto n = that.n(); if(inc < 0) { batched_data -= (n - 1) * inc; } for(int64_t i = 0; i < n; ++i) { random_generator(batched_data[i * inc]); } } } //! //! @brief Template for initializing a host //! (non_batched|batched|strided_batched)vector with NaNs. //! @param that That vector. //! 
@param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init_nan_template(U& that, bool seedReset = false) { if(seedReset) { rocblas_seedrand(); } for(int64_t batch_index = 0; batch_index < that.batch_count(); ++batch_index) { auto batched_data = that[batch_index]; auto inc = std::abs(that.inc()); auto n = that.n(); if(inc < 0) { batched_data -= (n - 1) * inc; } for(int64_t i = 0; i < n; ++i) { random_nan_generator(batched_data[i * inc]); } } } //! //! @brief Initialize a host_strided_batch_vector. //! @param that The host strided batch vector. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init(host_strided_batch_vector& that, bool seedReset = false) { rocblas_init_template(that, seedReset); } //! //! @brief Initialize a host_batch_vector. //! @param that The host batch vector. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init(host_batch_vector& that, bool seedReset = false) { rocblas_init_template(that, seedReset); } //! //! @brief Initialize a host_strided_batch_vector with NaNs. //! @param that The host strided batch vector to be initialized. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! template void rocblas_init_nan(host_strided_batch_vector& that, bool seedReset = false) { rocblas_init_nan_template(that, seedReset); } //! //! @brief Initialize a host_strided_batch_vector with NaNs. //! @param that The host strided batch vector to be initialized. //! @param seedReset reset the seed if true, do not reset the seed otherwise. //! 
template void rocblas_init_nan(host_batch_vector& that, bool seedReset = false) { rocblas_init_nan_template(that, seedReset); } rocSOLVER-rocm-6.4.3/clients/common/lapack/000077500000000000000000000000001503202240500203555ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gebd2_gebrd.cpp000066400000000000000000000034411503202240500247460ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_gebd2_gebrd.hpp" #define TESTING_GEBD2_GEBRD(...) 
template void testing_gebd2_gebrd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEBD2_GEBRD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gebd2_gebrd.hpp000066400000000000000000000674011503202240500247610ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/

#pragma once

#include "common/misc/client_util.hpp"
#include "common/misc/clientcommon.hpp"
#include "common/misc/lapack_host_reference.hpp"
#include "common/misc/norm.hpp"
#include "common/misc/rocsolver.hpp"
#include "common/misc/rocsolver_arguments.hpp"
#include "common/misc/rocsolver_test.hpp"

// NOTE(review): every `template` keyword in this header appears without its parameter list,
// and several calls/declarations appear without explicit template arguments; they look lost
// in extraction. STRIDED, GEBRD, BATCHED, CPU, GPU, T, S, U, Td, Sd, Ud, Th, Sh and Uh are
// presumably template parameters -- confirm against the upstream file.

// Checks the status returned by rocsolver_gebd2_gebrd for: a null handle, a negative batch
// count (only if STRIDED), each array argument replaced by nullptr in turn, and the
// quick-return cases (m == 0, n == 0, and batch count 0 if STRIDED).
template void gebd2_gebrd_checkBadArgs(const rocblas_handle handle,
                                       const rocblas_int m,
                                       const rocblas_int n,
                                       T dA,
                                       const rocblas_int lda,
                                       const rocblas_stride stA,
                                       S dD,
                                       const rocblas_stride stD,
                                       S dE,
                                       const rocblas_stride stE,
                                       U dTauq,
                                       const rocblas_stride stQ,
                                       U dTaup,
                                       const rocblas_stride stP,
                                       const rocblas_int bc)
{
    // handle
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, nullptr, m, n, dA, lda, stA, dD,
                                                stD, dE, stE, dTauq, stQ, dTaup, stP, bc),
                          rocblas_status_invalid_handle);

    // values
    // N/A

    // sizes (only check batch_count if applicable)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD,
                                                    stD, dE, stE, dTauq, stQ, dTaup, stP, -1),
                              rocblas_status_invalid_size);

    // pointers
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T) nullptr, lda, stA,
                                                dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA,
                                                (S) nullptr, stD, dE, stE, dTauq, stQ, dTaup, stP, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD,
                                                (S) nullptr, stE, dTauq, stQ, dTaup, stP, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD,
                                                dE, stE, (U) nullptr, stQ, dTaup, stP, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD, stD,
                                                dE, stE, dTauq, stQ, (U) nullptr, stP, bc),
                          rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, 0, n, (T) nullptr, lda, stA,
                                                (S) nullptr, stD, (S) nullptr, stE, (U) nullptr,
                                                stQ, (U) nullptr, stP, bc),
                          rocblas_status_success);
    EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, 0, (T) nullptr, lda, stA,
                                                (S) nullptr, stD, (S) nullptr, stE, (U) nullptr,
                                                stQ, (U) nullptr, stP, bc),
                          rocblas_status_success);

    // quick return with zero batch_count if applicable
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA, lda, stA, dD,
                                                    stD, dE, stE, dTauq, stQ, dTaup, stP, 0),
                              rocblas_status_success);
}

// Bad-argument test driver: allocates minimal device buffers (batched or strided layout for
// A depending on BATCHED) and runs gebd2_gebrd_checkBadArgs with small, valid sizes.
template void testing_gebd2_gebrd_bad_arg()
{
    using S = decltype(std::real(T{}));

    // safe arguments
    rocblas_local_handle handle;
    rocblas_int m = 2;
    rocblas_int n = 2;
    rocblas_int lda = 2;
    rocblas_stride stA = 1;
    rocblas_stride stD = 1;
    rocblas_stride stE = 1;
    rocblas_stride stQ = 1;
    rocblas_stride stP = 1;
    rocblas_int bc = 1;

    if(BATCHED)
    {
        // memory allocations
        device_batch_vector dA(1, 1, 1);
        device_strided_batch_vector dD(1, 1, 1, 1);
        device_strided_batch_vector dE(1, 1, 1, 1);
        device_strided_batch_vector dTauq(1, 1, 1, 1);
        device_strided_batch_vector dTaup(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dD.memcheck());
        CHECK_HIP_ERROR(dE.memcheck());
        CHECK_HIP_ERROR(dTauq.memcheck());
        CHECK_HIP_ERROR(dTaup.memcheck());

        // check bad arguments
        gebd2_gebrd_checkBadArgs(handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE,
                                 dTauq.data(), stQ, dTaup.data(), stP, bc);
    }
    else
    {
        // memory allocations
        device_strided_batch_vector dA(1, 1, 1, 1);
        device_strided_batch_vector dD(1, 1, 1, 1);
        device_strided_batch_vector dE(1, 1, 1, 1);
        device_strided_batch_vector dTauq(1, 1, 1, 1);
        device_strided_batch_vector dTaup(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dD.memcheck());
        CHECK_HIP_ERROR(dE.memcheck());
        CHECK_HIP_ERROR(dTauq.memcheck());
        CHECK_HIP_ERROR(dTaup.memcheck());

        // check bad arguments
        gebd2_gebrd_checkBadArgs(handle, m, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE,
                                 dTauq.data(), stQ, dTaup.data(), stP, bc);
    }
}

// Fills the host matrix hA with random data (if CPU) and copies it to dA (if GPU).
// Only hA and dA are touched; the remaining buffers are accepted but not used here.
template void gebd2_gebrd_initData(const rocblas_handle handle,
                                   const rocblas_int m,
                                   const rocblas_int n,
                                   Td& dA,
                                   const rocblas_int lda,
                                   const rocblas_stride stA,
                                   Sd& dD,
                                   const rocblas_stride stD,
                                   Sd& dE,
                                   const rocblas_stride stE,
                                   Ud& dTauq,
                                   const rocblas_stride stQ,
                                   Ud& dTaup,
                                   const rocblas_stride stP,
                                   const rocblas_int bc,
                                   Th& hA,
                                   Sh& hD,
                                   Sh& hE,
                                   Uh& hTauq,
                                   Uh& hTaup)
{
    if(CPU)
    {
        rocblas_init(hA, true);

        // scale A to avoid singularities
        // (+400 on the diagonal and on the super-diagonal when m >= n or the sub-diagonal
        // when m < n; -4 everywhere else)
        for(rocblas_int b = 0; b < bc; ++b)
        {
            for(rocblas_int i = 0; i < m; i++)
            {
                for(rocblas_int j = 0; j < n; j++)
                {
                    if(i == j || (m >= n && j == i + 1) || (m < n && i == j + 1))
                        hA[b][i + j * lda] += 400;
                    else
                        hA[b][i + j * lda] -= 4;
                }
            }
        }
    }

    if(GPU)
    {
        // now copy to the GPU
        CHECK_HIP_ERROR(dA.transfer_from(hA));
    }
}

// Runs the factorization and stores in *max_err the largest per-batch value returned by
// norm_error('F', ...) between the input hA and the matrix rebuilt from the factorization
// held in hARes / hTauq / hTaup.
template void gebd2_gebrd_getError(const rocblas_handle handle,
                                   const rocblas_int m,
                                   const rocblas_int n,
                                   Td& dA,
                                   const rocblas_int lda,
                                   const rocblas_stride stA,
                                   Sd& dD,
                                   const rocblas_stride stD,
                                   Sd& dE,
                                   const rocblas_stride stE,
                                   Ud& dTauq,
                                   const rocblas_stride stQ,
                                   Ud& dTaup,
                                   const rocblas_stride stP,
                                   const rocblas_int bc,
                                   Th& hA,
                                   Th& hARes,
                                   Sh& hD,
                                   Sh& hE,
                                   Uh& hTauq,
                                   Uh& hTaup,
                                   double* max_err)
{
    constexpr bool COMPLEX = rocblas_is_complex;
    constexpr bool VERIFY_IMPLICIT_TEST = false;

    // workspace handed to the cpu_* reference routines
    std::vector hW(std::max(m, n));

    // input data initialization
    gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc,
                         hA, hD, hE, hTauq, hTaup);

    // execute computations
    // use verify_implicit_test to check correctness of the implicit test using
    // CPU lapack
    if(!VERIFY_IMPLICIT_TEST)
    {
        // GPU lapack
        CHECK_ROCBLAS_ERROR(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA,
                                                  dD.data(), stD, dE.data(), stE, dTauq.data(), stQ,
                                                  dTaup.data(), stP, bc));
        CHECK_HIP_ERROR(hARes.transfer_from(dA));
        CHECK_HIP_ERROR(hTauq.transfer_from(dTauq));
        CHECK_HIP_ERROR(hTaup.transfer_from(dTaup));
    }
    else
    {
        // CPU lapack
        for(rocblas_int b = 0; b < bc; ++b)
        {
            memcpy(hARes[b], hA[b], lda * n * sizeof(T));
            GEBRD ? cpu_gebrd(m, n, hARes[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data(),
                              std::max(m, n))
                  : cpu_gebd2(m, n, hARes[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data());
        }
    }

    // reconstruct A from the factorization for implicit testing
    // vec holds one reflector vector at a time: entry 0 stays 1, entries from index 1 are
    // copied out of `a` (which is zeroed at those positions) before each cpu_larf call.
    std::vector vec(std::max(m, n));
    vec[0] = 1;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        T* a = hARes[b];
        T* tauq = hTauq[b];
        T* taup = hTaup[b];

        if(m >= n)
        {
            // walk the columns from last to first
            for(int j = n - 1; j >= 0; j--)
            {
                if(j < n - 1)
                {
                    // taup[j] and row j are conjugated for complex types around the
                    // right-side application; taup[j] is conjugated back afterwards
                    if(COMPLEX)
                    {
                        cpu_lacgv(1, taup + j, 1);
                        cpu_lacgv(n - j - 1, a + j + (j + 1) * lda, lda);
                    }
                    for(int i = 1; i < n - j - 1; i++)
                    {
                        vec[i] = a[j + (j + i + 1) * lda];
                        a[j + (j + i + 1) * lda] = 0;
                    }
                    cpu_larf(rocblas_side_right, m - j, n - j - 1, vec.data(), 1, taup + j,
                             a + j + (j + 1) * lda, lda, hW.data());
                    if(COMPLEX)
                        cpu_lacgv(1, taup + j, 1);
                }

                for(int i = 1; i < m - j; i++)
                {
                    vec[i] = a[(j + i) + j * lda];
                    a[(j + i) + j * lda] = 0;
                }
                cpu_larf(rocblas_side_left, m - j, n - j, vec.data(), 1, tauq + j, a + j + j * lda,
                         lda, hW.data());
            }
        }
        else
        {
            // walk the rows from last to first
            for(int j = m - 1; j >= 0; j--)
            {
                if(j < m - 1)
                {
                    for(int i = 1; i < m - j - 1; i++)
                    {
                        vec[i] = a[(j + i + 1) + j * lda];
                        a[(j + i + 1) + j * lda] = 0;
                    }
                    cpu_larf(rocblas_side_left, m - j - 1, n - j, vec.data(), 1, tauq + j,
                             a + (j + 1) + j * lda, lda, hW.data());
                }

                if(COMPLEX)
                {
                    cpu_lacgv(1, taup + j, 1);
                    cpu_lacgv(n - j, a + j + j * lda, lda);
                }
                for(int i = 1; i < n - j; i++)
                {
                    vec[i] = a[j + (j + i) * lda];
                    a[j + (j + i) * lda] = 0;
                }
                cpu_larf(rocblas_side_right, m - j, n - j, vec.data(), 1, taup + j, a + j + j * lda,
                         lda, hW.data());
                if(COMPLEX)
                    cpu_lacgv(1, taup + j, 1);
            }
        }
    }

    // error is ||hA - hARes|| / ||hA||
    // using frobenius norm
    double err;
    *max_err = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        err = norm_error('F', m, n, lda, hA[b], hARes[b]);
        *max_err = err > *max_err ? err : *max_err;
    }
}

// Measures CPU reference time (skipped when perf is true) and the average GPU time over
// hot_calls timed calls, after two untimed cold calls.
// *gpu_time_used is accumulated into, so the caller must pass it in zeroed.
template void gebd2_gebrd_getPerfData(const rocblas_handle handle,
                                      const rocblas_int m,
                                      const rocblas_int n,
                                      Td& dA,
                                      const rocblas_int lda,
                                      const rocblas_stride stA,
                                      Sd& dD,
                                      const rocblas_stride stD,
                                      Sd& dE,
                                      const rocblas_stride stE,
                                      Ud& dTauq,
                                      const rocblas_stride stQ,
                                      Ud& dTaup,
                                      const rocblas_stride stP,
                                      const rocblas_int bc,
                                      Th& hA,
                                      Sh& hD,
                                      Sh& hE,
                                      Uh& hTauq,
                                      Uh& hTaup,
                                      double* gpu_time_used,
                                      double* cpu_time_used,
                                      const rocblas_int hot_calls,
                                      const int profile,
                                      const bool profile_kernels,
                                      const bool perf)
{
    std::vector hW(std::max(m, n));

    if(!perf)
    {
        gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP,
                             bc, hA, hD, hE, hTauq, hTaup);

        // cpu-lapack performance (only if not in perf mode)
        *cpu_time_used = get_time_us_no_sync();
        for(rocblas_int b = 0; b < bc; ++b)
        {
            GEBRD ? cpu_gebrd(m, n, hA[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data(),
                              std::max(m, n))
                  : cpu_gebd2(m, n, hA[b], lda, hD[b], hE[b], hTauq[b], hTaup[b], hW.data());
        }
        *cpu_time_used = get_time_us_no_sync() - *cpu_time_used;
    }

    gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc,
                         hA, hD, hE, hTauq, hTaup);

    // cold calls
    for(int iter = 0; iter < 2; iter++)
    {
        gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP,
                             bc, hA, hD, hE, hTauq, hTaup);

        CHECK_ROCBLAS_ERROR(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA,
                                                  dD.data(), stD, dE.data(), stE, dTauq.data(), stQ,
                                                  dTaup.data(), stP, bc));
    }

    // gpu-lapack performance
    hipStream_t stream;
    CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
    double start;

    if(profile > 0)
    {
        if(profile_kernels)
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile
                                         | rocblas_layer_mode_ex_log_kernel);
        else
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile);
        rocsolver_log_set_max_levels(profile);
    }

    for(rocblas_int iter = 0; iter < hot_calls; iter++)
    {
        // data is re-initialized outside the timed region on every iteration
        gebd2_gebrd_initData(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP,
                             bc, hA, hD, hE, hTauq, hTaup);

        start = get_time_us_sync(stream);
        rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(), lda, stA, dD.data(), stD,
                              dE.data(), stE, dTauq.data(), stQ, dTaup.data(), stP, bc);
        *gpu_time_used += get_time_us_sync(stream) - start;
    }
    // NOTE(review): hot_calls == 0 makes this a floating-point division by zero -- confirm
    // the argument parser guarantees at least one iteration.
    *gpu_time_used /= hot_calls;
}

// Test/benchmark entry point: reads the problem description from argus, checks the
// invalid-size and quick-return paths, optionally queries device workspace, then runs the
// accuracy check and/or the timing run and reports the results.
template void testing_gebd2_gebrd(Arguments& argus)
{
    using S = decltype(std::real(T{}));

    // get arguments
    rocblas_local_handle handle;
    rocblas_int m = argus.get("m");
    rocblas_int n = argus.get("n", m);
    rocblas_int lda = argus.get("lda", m);
    rocblas_stride stA = argus.get("strideA", lda * n);
    rocblas_stride stD = argus.get("strideD", std::min(m, n));
    rocblas_stride stE = argus.get("strideE", std::min(m, n) - 1);
    rocblas_stride stQ = argus.get("strideQ", std::min(m, n));
    rocblas_stride stP = argus.get("strideP", std::min(m, n));

    rocblas_int bc = argus.batch_count;
    rocblas_int hot_calls = argus.iters;

    // the result copy of A is only allocated when a check is requested
    rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0;

    // check non-supported values
    // N/A

    // determine sizes
    size_t size_A = lda * n;
    size_t size_D = std::min(m, n);
    // NOTE(review): when min(m, n) == 0 this converts -1 to size_t (wraps to a huge value)
    // before the quick-return check below -- confirm the containers tolerate it.
    size_t size_E = std::min(m, n) - 1;
    size_t size_Q = std::min(m, n);
    size_t size_P = std::min(m, n);
    double max_error = 0, gpu_time_used = 0, cpu_time_used = 0;

    size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0;

    // check invalid sizes
    bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0);
    if(invalid_size)
    {
        if(BATCHED)
            EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n,
                                                        (T* const*)nullptr, lda, stA, (S*)nullptr,
                                                        stD, (S*)nullptr, stE, (T*)nullptr, stQ,
                                                        (T*)nullptr, stP, bc),
                                  rocblas_status_invalid_size);
        else
            EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T*)nullptr,
                                                        lda, stA, (S*)nullptr, stD, (S*)nullptr,
                                                        stE, (T*)nullptr, stQ, (T*)nullptr, stP, bc),
                                  rocblas_status_invalid_size);

        if(argus.timing)
            rocsolver_bench_inform(inform_invalid_size);

        return;
    }

    // memory size query is necessary
    if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND)
    {
        CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle));
        if(BATCHED)
            CHECK_ALLOC_QUERY(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T* const*)nullptr,
                                                    lda, stA, (S*)nullptr, stD, (S*)nullptr, stE,
                                                    (T*)nullptr, stQ, (T*)nullptr, stP, bc));
        else
            CHECK_ALLOC_QUERY(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, (T*)nullptr, lda,
                                                    stA, (S*)nullptr, stD, (S*)nullptr, stE,
                                                    (T*)nullptr, stQ, (T*)nullptr, stP, bc));

        size_t size;
        CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size));
        if(argus.mem_query)
        {
            rocsolver_bench_inform(inform_mem_query, size);
            return;
        }

        CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size));
    }

    if(BATCHED)
    {
        // memory allocations
        host_batch_vector hA(size_A, 1, bc);
        host_batch_vector hARes(size_ARes, 1, bc);
        host_strided_batch_vector hD(size_D, 1, stD, bc);
        host_strided_batch_vector hE(size_E, 1, stE, bc);
        host_strided_batch_vector hTaup(size_P, 1, stP, bc);
        host_strided_batch_vector hTauq(size_Q, 1, stQ, bc);
        device_batch_vector dA(size_A, 1, bc);
        device_strided_batch_vector dD(size_D, 1, stD, bc);
        device_strided_batch_vector dE(size_E, 1, stE, bc);
        device_strided_batch_vector dTauq(size_Q, 1, stQ, bc);
        device_strided_batch_vector dTaup(size_P, 1, stP, bc);
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        if(size_D)
            CHECK_HIP_ERROR(dD.memcheck());
        if(size_E)
            CHECK_HIP_ERROR(dE.memcheck());
        if(size_Q)
            CHECK_HIP_ERROR(dTauq.memcheck());
        if(size_P)
            CHECK_HIP_ERROR(dTaup.memcheck());

        // check quick return
        if(m == 0 || n == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(),
                                                        lda, stA, dD.data(), stD, dE.data(), stE,
                                                        dTauq.data(), stQ, dTaup.data(), stP, bc),
                                  rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            gebd2_gebrd_getError(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup,
                                 stP, bc, hA, hARes, hD, hE, hTauq, hTaup, &max_error);

        // collect performance data
        if(argus.timing)
            gebd2_gebrd_getPerfData(
                handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD,
                hE, hTauq, hTaup, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile,
                argus.profile_kernels, argus.perf);
    }

    else
    {
        // memory allocations
        host_strided_batch_vector hA(size_A, 1, stA, bc);
        host_strided_batch_vector hARes(size_ARes, 1, stARes, bc);
        host_strided_batch_vector hD(size_D, 1, stD, bc);
        host_strided_batch_vector hE(size_E, 1, stE, bc);
        host_strided_batch_vector hTaup(size_P, 1, stP, bc);
        host_strided_batch_vector hTauq(size_Q, 1, stQ, bc);
        device_strided_batch_vector dA(size_A, 1, stA, bc);
        device_strided_batch_vector dD(size_D, 1, stD, bc);
        device_strided_batch_vector dE(size_E, 1, stE, bc);
        device_strided_batch_vector dTauq(size_Q, 1, stQ, bc);
        device_strided_batch_vector dTaup(size_P, 1, stP, bc);
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        if(size_D)
            CHECK_HIP_ERROR(dD.memcheck());
        if(size_E)
            CHECK_HIP_ERROR(dE.memcheck());
        if(size_Q)
            CHECK_HIP_ERROR(dTauq.memcheck());
        if(size_P)
            CHECK_HIP_ERROR(dTaup.memcheck());

        // check quick return
        if(m == 0 || n == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(rocsolver_gebd2_gebrd(STRIDED, GEBRD, handle, m, n, dA.data(),
                                                        lda, stA, dD.data(), stD, dE.data(), stE,
                                                        dTauq.data(), stQ, dTaup.data(), stP, bc),
                                  rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            gebd2_gebrd_getError(handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup,
                                 stP, bc, hA, hARes, hD, hE, hTauq, hTaup, &max_error);

        // collect performance data
        if(argus.timing)
            gebd2_gebrd_getPerfData(
                handle, m, n, dA, lda, stA, dD, stD, dE, stE, dTauq, stQ, dTaup, stP, bc, hA, hD,
                hE, hTauq, hTaup, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile,
                argus.profile_kernels, argus.perf);
    }

    // validate results for rocsolver-test
    // using m*n * machine_precision as tolerance
    if(argus.unit_check)
        ROCSOLVER_TEST_CHECK(T, max_error, m * n);

    // output results for rocsolver-bench
    if(argus.timing)
    {
        if(!argus.perf)
        {
            rocsolver_bench_header("Arguments:");
            if(BATCHED)
            {
                rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c");
                rocsolver_bench_output(m, n, lda, stP, bc);
            }
            else if(STRIDED)
            {
                rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c");
                rocsolver_bench_output(m, n, lda, stA, stP, bc);
            }
            else
            {
                rocsolver_bench_output("m", "n", "lda");
                rocsolver_bench_output(m, n, lda);
            }
            rocsolver_bench_header("Results:");
            if(argus.norm_check)
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error");
                rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error);
            }
            else
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us");
                rocsolver_bench_output(cpu_time_used, gpu_time_used);
            }
            rocsolver_bench_endl();
        }
        else
        {
            // perf mode prints only the bare numbers
            if(argus.norm_check)
                rocsolver_bench_output(gpu_time_used, max_error);
            else
                rocsolver_bench_output(gpu_time_used);
        }
    }

    // ensure all arguments were consumed
    argus.validate_consumed();
}

// Declares (without defining) the explicit instantiation of testing_gebd2_gebrd for one
// template-argument tuple.
#define EXTERN_TESTING_GEBD2_GEBRD(...) \
    extern template void testing_gebd2_gebrd<__VA_ARGS__>(Arguments&);

INSTANTIATE(EXTERN_TESTING_GEBD2_GEBRD,
            FOREACH_MATRIX_DATA_LAYOUT,
            FOREACH_BLOCKED_VARIANT,
            FOREACH_SCALAR_TYPE,
            APPLY_STAMP)
rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrf_npvt.cpp000066400000000000000000000033401503202240500254760ustar00rootroot00000000000000
/* **************************************************************************
 * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * *************************************************************************/

#include "testing_geblttrf_npvt.hpp"

// Explicit instantiation of testing_geblttrf_npvt for one template-argument tuple.
#define TESTING_GEBLTTRF_NPVT(...) template void testing_geblttrf_npvt<__VA_ARGS__>(Arguments&);

INSTANTIATE(TESTING_GEBLTTRF_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP)
rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrf_npvt.hpp000066400000000000000000000711011503202240500255030ustar00rootroot00000000000000
/* **************************************************************************
 * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void geblttrf_npvt_checkBadArgs(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, T dC, const rocblas_int ldc, const rocblas_stride stC, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, nullptr, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T) nullptr, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, (T) nullptr, ldb, stB, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, 0, nblocks, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, dInfo, bc), 
rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (U) nullptr, 0), rocblas_status_success); } template void testing_geblttrf_npvt_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int nb = 1; rocblas_int nblocks = 2; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldc = 1; rocblas_stride stA = 2; rocblas_stride stB = 2; rocblas_stride stC = 2; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dC(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments geblttrf_npvt_checkBadArgs(handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments geblttrf_npvt_checkBadArgs(handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc); } } template void geblttrf_npvt_initData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int lda, Td& dB, const rocblas_int ldb, Td& dC, const rocblas_int ldc, const rocblas_int bc, Th& hA, Th& hB, Th& hC, const bool singular) { if(CPU) { rocblas_init(hA, true); 
rocblas_init(hB, false); rocblas_init(hC, false); rocblas_int n = nb * nblocks; for(rocblas_int b = 0; b < bc; ++b) { // scale to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { for(rocblas_int k = 0; k < nblocks; k++) { if(i == j) hB[b][i + j * ldb + k * ldb * nb] += 400; else hB[b][i + j * ldb + k * ldb * nb] -= 4; } for(rocblas_int k = 0; k < nblocks - 1; k++) { hA[b][i + j * lda + k * lda * nb] -= 4; hC[b][i + j * ldc + k * ldc * nb] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes) rocblas_int jj = n / 4 + b; jj -= (jj / n) * n; rocblas_int j = jj % nb; rocblas_int k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column hB[b][i + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) hA[b][i + j * lda + k * lda * nb] = 0; if(k > 0) hC[b][i + j * ldc + (k - 1) * ldc * nb] = 0; } jj = n / 2 + b; jj -= (jj / n) * n; j = jj % nb; k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column hB[b][i + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) hA[b][i + j * lda + k * lda * nb] = 0; if(k > 0) hC[b][i + j * ldc + (k - 1) * ldc * nb] = 0; } jj = n - 1 + b; jj -= (jj / n) * n; j = jj % nb; k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column hB[b][i + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) hA[b][i + j * lda + k * lda * nb] = 0; if(k > 0) hC[b][i + j * ldc + (k - 1) * ldc * nb] = 0; } } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void geblttrf_npvt_getError(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, 
Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { int n = nb * nblocks; std::vector L(n * n); std::vector U(n * n); std::vector M(n * n); std::vector MRes(n * n); // input data initialization geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hCRes.transfer_from(dC)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { EXPECT_GT(hInfoRes[b][0], 0) << "where b = " << b; if(hInfoRes[b][0] <= 0) err++; } else { EXPECT_EQ(hInfoRes[b][0], 0) << "where b = " << b; if(hInfoRes[b][0] != 0) err++; } } *max_err += err; for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { // compute diagonal blocks and store in full matrix L for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { if(i <= j) L[i + j * n + k * (n + 1) * nb] = hBRes[b][i + j * ldb + k * ldb * nb]; else L[i + j * n + k * (n + 1) * nb] = 0; } } cpu_trmm(rocblas_side_left, rocblas_fill_lower, rocblas_operation_none, rocblas_diagonal_unit, nb, nb, T(1), hBRes[b] + k * ldb * nb, ldb, L.data() + k * (n + 1) * nb, n); } // move blocks A, updated C, and I into full matrices L and U for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { if(k < nblocks - 1) { for(rocblas_int j = 0; j < nb; j++) { U[i + (j + nb) * n + k * (n + 1) * nb] = hCRes[b][i + j * ldc + k * ldc * nb]; L[(i + nb) + j * n + k * (n + 1) * nb] = hA[b][i + j 
* lda + k * lda * nb]; } } U[i + i * n + k * (n + 1) * nb] = 1; } } // reconstruct input matrix from factors and store it in MRes cpu_gemm(rocblas_operation_none, rocblas_operation_none, n, n, n, T(1), L.data(), n, U.data(), n, T(0), MRes.data(), n); // form original matrix from original blocks for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { M[i + j * n + k * (n + 1) * nb] = hB[b][i + j * ldb + k * ldb * nb]; if(k < nblocks - 1) { M[(i + nb) + j * n + k * (n + 1) * nb] = hA[b][i + j * lda + k * lda * nb]; M[i + (j + nb) * n + k * (n + 1) * nb] = hC[b][i + j * ldc + k * ldc * nb]; } } } } // error is ||M - MRes|| / ||M|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm err = norm_error('F', n, n, n, M.data(), MRes.data()); *max_err = err > *max_err ? err : *max_err; } } } template void geblttrf_npvt_getPerfData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { // there is no direct CPU/LAPACK equivalent for this function, therefore // we return an invalid CPU time *cpu_time_used = nan(""); } geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); // cold calls for(int iter = 0; iter < 2; iter++) { geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); CHECK_ROCBLAS_ERROR(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc)); } // 
gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geblttrf_npvt_initData(handle, nb, nblocks, dA, lda, dB, ldb, dC, ldc, bc, hA, hB, hC, singular); start = get_time_us_sync(stream); rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geblttrf_npvt(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int nb = argus.get("nb"); rocblas_int nblocks = argus.get("nblocks"); rocblas_int lda = argus.get("lda", nb); rocblas_int ldb = argus.get("ldb", nb); rocblas_int ldc = argus.get("ldc", nb); rocblas_stride stA = argus.get("strideA", lda * nb * nblocks); rocblas_stride stB = argus.get("strideB", ldb * nb * nblocks); rocblas_stride stC = argus.get("strideC", ldc * nb * nblocks); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; rocblas_stride stCRes = (argus.unit_check || argus.norm_check) ? stC : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * nb * nblocks; size_t size_B = size_t(ldb) * nb * nblocks; size_t size_C = size_t(ldc) * nb * nblocks; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = (nb < 0 || nblocks < 0 || lda < nb || ldb < nb || ldc < nb || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geblttrf_npvt( STRIDED, handle, nb, nblocks, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hC(size_C, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_batch_vector hCRes(size_CRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dC(size_C, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) 
CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrf_npvt_getError(handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hBRes, hC, hCRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) geblttrf_npvt_getPerfData( handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hC(size_C, 1, stC, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hCRes(size_CRes, 1, stCRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt(STRIDED, handle, nb, nblocks, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); 
return; } // check computations if(argus.unit_check || argus.norm_check) geblttrf_npvt_getError(handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hBRes, hC, hCRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) geblttrf_npvt_getPerfData( handle, nb, nblocks, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dInfo, bc, hA, hB, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using nb * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("nb", "nblocks", "lda", "ldb", "ldc", "batch_c"); rocsolver_bench_output(nb, nblocks, lda, ldb, ldc, bc); } else if(STRIDED) { rocsolver_bench_output("nb", "nblocks", "lda", "strideA", "ldb", "strideB", "ldc", "strideC", "batch_c"); rocsolver_bench_output(nb, nblocks, lda, stA, ldb, stB, ldc, stC, bc); } else { rocsolver_bench_output("nb", "nblocks", "lda", "ldb", "ldc"); rocsolver_bench_output(nb, nblocks, lda, ldb, ldc); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEBLTTRF_NPVT(...) 
\ extern template void testing_geblttrf_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEBLTTRF_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrf_npvt_interleaved.cpp000066400000000000000000000033721503202240500300650ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_geblttrf_npvt_interleaved.hpp" #define TESTING_GEBLTTRF_NPVT_INTERLEAVED(...) 
\ template void testing_geblttrf_npvt_interleaved<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEBLTTRF_NPVT_INTERLEAVED, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrf_npvt_interleaved.hpp000066400000000000000000000705171503202240500300770ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void geblttrf_npvt_interleaved_checkBadArgs(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, T dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, T dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(nullptr, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, (T) nullptr, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, dA, inca, lda, stA, (T) nullptr, incb, ldb, stB, dC, incc, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, (T) nullptr, incc, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, (U) nullptr, bc), rocblas_status_invalid_pointer); 
// quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, 0, nblocks, (T) nullptr, inca, lda, stA, (T) nullptr, incb, ldb, stB, (T) nullptr, incc, ldc, stC, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_geblttrf_npvt_interleaved(handle, nb, 0, (T) nullptr, inca, lda, stA, (T) nullptr, incb, ldb, stB, (T) nullptr, incc, ldc, stC, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, (U) nullptr, 0), rocblas_status_success); } template void testing_geblttrf_npvt_interleaved_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int nb = 1; rocblas_int nblocks = 2; rocblas_int inca = 1; rocblas_int incb = 1; rocblas_int incc = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldc = 1; rocblas_stride stA = 2; rocblas_stride stB = 2; rocblas_stride stC = 2; rocblas_int bc = 1; // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments geblttrf_npvt_interleaved_checkBadArgs(handle, nb, nblocks, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dInfo.data(), bc); } template void geblttrf_npvt_interleaved_initData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, const rocblas_int bc, Th& hA, Th& hB, Th& hC, const bool singular) 
{ if(CPU) { rocblas_init(hA, true); rocblas_init(hB, false); rocblas_init(hC, false); rocblas_int n = nb * nblocks; for(rocblas_int b = 0; b < bc; ++b) { T* A = hA[0] + b * stA; T* B = hB[0] + b * stB; T* C = hC[0] + b * stC; // scale to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { for(rocblas_int k = 0; k < nblocks; k++) { if(i == j) B[i * incb + j * ldb + k * ldb * nb] += 400; else B[i * incb + j * ldb + k * ldb * nb] -= 4; } for(rocblas_int k = 0; k < nblocks - 1; k++) { A[i * inca + j * lda + k * lda * nb] -= 4; C[i * incc + j * ldc + k * ldc * nb] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes) rocblas_int jj = n / 4 + b; jj -= (jj / n) * n; rocblas_int j = jj % nb; rocblas_int k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column B[i * incb + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) A[i * inca + j * lda + k * lda * nb] = 0; if(k > 0) C[i * incc + j * ldc + (k - 1) * ldc * nb] = 0; } jj = n / 2 + b; jj -= (jj / n) * n; j = jj % nb; k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column B[i * incb + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) A[i * inca + j * lda + k * lda * nb] = 0; if(k > 0) C[i * incc + j * ldc + (k - 1) * ldc * nb] = 0; } jj = n - 1 + b; jj -= (jj / n) * n; j = jj % nb; k = jj / nb; for(rocblas_int i = 0; i < nb; i++) { // zero the jj-th column B[i * incb + j * ldb + k * ldb * nb] = 0; if(k < nblocks - 1) A[i * inca + j * lda + k * lda * nb] = 0; if(k > 0) C[i * incc + j * ldc + (k - 1) * ldc * nb] = 0; } } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dC.transfer_from(hC)); } } template void geblttrf_npvt_interleaved_getError(const rocblas_handle handle, 
const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Th& hC, Th& hCRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { int n = nb * nblocks; std::vector Btmp(nb * n); std::vector L(n * n); std::vector U(n * n); std::vector M(n * n); std::vector MRes(n * n); // input data initialization geblttrf_npvt_interleaved_initData(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, bc, hA, hB, hC, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geblttrf_npvt_interleaved( handle, nb, nblocks, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hCRes.transfer_from(dC)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { EXPECT_GT(hInfoRes[b][0], 0) << "where b = " << b; if(hInfoRes[b][0] <= 0) err++; } else { EXPECT_EQ(hInfoRes[b][0], 0) << "where b = " << b; if(hInfoRes[b][0] != 0) err++; } } *max_err += err; for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { // compute diagonal blocks and store in full matrix L for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { Btmp[i + j * nb + k * nb * nb] = hBRes[0][i * incb + j * ldb + k * ldb * nb + b * stB]; if(i <= j) L[i + j * n + k * (n + 1) * nb] = Btmp[i + j * nb + k * nb * nb]; else L[i + j * n + k * (n + 1) * nb] = 0; } } cpu_trmm(rocblas_side_left, rocblas_fill_lower, rocblas_operation_none, rocblas_diagonal_unit, nb, 
nb, T(1), Btmp.data() + k * nb * nb, nb, L.data() + k * (n + 1) * nb, n); } // move blocks A, updated C, and I into full matrices L and U for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { if(k < nblocks - 1) { for(rocblas_int j = 0; j < nb; j++) { U[i + (j + nb) * n + k * (n + 1) * nb] = hCRes[0][i * incc + j * ldc + k * ldc * nb + b * stC]; L[(i + nb) + j * n + k * (n + 1) * nb] = hA[0][i * inca + j * lda + k * lda * nb + b * stA]; } } U[i + i * n + k * (n + 1) * nb] = 1; } } // reconstruct input matrix from factors and store it in MRes cpu_gemm(rocblas_operation_none, rocblas_operation_none, n, n, n, T(1), L.data(), n, U.data(), n, T(0), MRes.data(), n); // form original matrix from original blocks for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { M[i + j * n + k * (n + 1) * nb] = hB[0][i * incb + j * ldb + k * ldb * nb + b * stB]; if(k < nblocks - 1) { M[(i + nb) + j * n + k * (n + 1) * nb] = hA[0][i * inca + j * lda + k * lda * nb + b * stA]; M[i + (j + nb) * n + k * (n + 1) * nb] = hC[0][i * incc + j * ldc + k * ldc * nb + b * stC]; } } } } // error is ||M - MRes|| / ||M|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm err = norm_error('F', n, n, n, M.data(), MRes.data()); *max_err = err > *max_err ? 
err : *max_err; } } } template void geblttrf_npvt_interleaved_getPerfData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, Td& dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hC, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { // there is no direct CPU/LAPACK equivalent for this function, therefore // we return an invalid CPU time *cpu_time_used = nan(""); } geblttrf_npvt_interleaved_initData(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, bc, hA, hB, hC, singular); // cold calls for(int iter = 0; iter < 2; iter++) { geblttrf_npvt_interleaved_initData(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, bc, hA, hB, hC, singular); CHECK_ROCBLAS_ERROR(rocsolver_geblttrf_npvt_interleaved( handle, nb, nblocks, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geblttrf_npvt_interleaved_initData(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, bc, hA, hB, hC, singular); start = get_time_us_sync(stream); rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, 
ldc, stC, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geblttrf_npvt_interleaved(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int nb = argus.get("nb"); rocblas_int nblocks = argus.get("nblocks"); rocblas_int inca = argus.get("inca", 1); rocblas_int incb = argus.get("incb", 1); rocblas_int incc = argus.get("incc", 1); rocblas_int lda = argus.get("lda", nb); rocblas_int ldb = argus.get("ldb", nb); rocblas_int ldc = argus.get("ldc", nb); rocblas_stride stA = argus.get("strideA", lda * nb * nblocks); rocblas_stride stB = argus.get("strideB", ldb * nb * nblocks); rocblas_stride stC = argus.get("strideC", ldc * nb * nblocks); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; rocblas_stride stCRes = (argus.unit_check || argus.norm_check) ? stC : 0; // check non-supported values // N/A // determine sizes rocblas_int n = nb * nblocks; size_t size_A = std::max(size_t(lda) * n, size_t(stA)) * bc; size_t size_B = std::max(size_t(ldb) * n, size_t(stB)) * bc; size_t size_C = std::max(size_t(ldc) * n, size_t(stC)) * bc; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; size_t size_CRes = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_a = (inca < 1 || lda < inca * nb); bool invalid_b = (incc < 1 || ldc < incc * nb); bool invalid_c = (incc < 1 || ldc < incc * nb); bool invalid_size = (nb < 0 || nblocks < 0 || bc < 0 || invalid_a || invalid_b || invalid_c); if(invalid_size) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved(handle, nb, nblocks, (T*)nullptr, inca, lda, stA, (T*)nullptr, incb, ldb, stB, (T*)nullptr, incc, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_geblttrf_npvt_interleaved( handle, nb, nblocks, (T*)nullptr, inca, lda, stA, (T*)nullptr, incb, ldb, stB, (T*)nullptr, incc, ldc, stC, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hB(size_B, 1, size_B, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hBRes(size_BRes, 1, size_BRes, 1); host_strided_batch_vector hCRes(size_CRes, 1, size_CRes, 1); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dB(size_B, 1, size_B, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || bc 
== 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrf_npvt_interleaved( handle, nb, nblocks, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrf_npvt_interleaved_getError(handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dInfo, bc, hA, hB, hBRes, hC, hCRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) geblttrf_npvt_interleaved_getPerfData( handle, nb, nblocks, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dInfo, bc, hA, hB, hC, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); // validate results for rocsolver-test // using nb * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("nb", "nblocks", "inca", "lda", "strideA", "incb", "ldb", "strideB", "incc", "ldc", "strideC", "batch_c"); rocsolver_bench_output(nb, nblocks, inca, lda, stA, incb, ldb, stB, incc, ldc, stC, bc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEBLTTRF_NPVT_INTERLEAVED(...) 
\ extern template void testing_geblttrf_npvt_interleaved<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEBLTTRF_NPVT_INTERLEAVED, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrs_npvt.cpp000066400000000000000000000033401503202240500255130ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_geblttrs_npvt.hpp" #define TESTING_GEBLTTRS_NPVT(...) 
template void testing_geblttrs_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEBLTTRS_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrs_npvt.hpp000066400000000000000000000672231503202240500255320ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void geblttrs_npvt_checkBadArgs(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, T dC, const rocblas_int ldc, const rocblas_stride stC, T dX, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, nullptr, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T) nullptr, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, (T) nullptr, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, (T) nullptr, ldc, stC, dX, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers 
EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, 0, nblocks, nrhs, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, 0, nrhs, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (T) nullptr, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, 0, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, (T) nullptr, ldx, stX, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, 0), rocblas_status_success); } template void testing_geblttrs_npvt_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int nb = 1; rocblas_int nblocks = 2; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldc = 1; rocblas_int ldx = 1; rocblas_stride stA = 2; rocblas_stride stB = 2; rocblas_stride stC = 2; rocblas_stride stX = 2; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dC(1, 1, 1); device_batch_vector dX(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); // check bad arguments geblttrs_npvt_checkBadArgs(handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); // check bad arguments 
geblttrs_npvt_checkBadArgs(handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc); } } template void geblttrs_npvt_initData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int lda, Td& dB, const rocblas_int ldb, Td& dC, const rocblas_int ldc, Td& dX, const rocblas_int ldx, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hRHS) { if(CPU) { int info; int n = nb * nblocks; std::vector M(n * n); std::vector XX(n * nrhs); std::vector XB(n * nrhs); std::vector ipiv(nb); // initialize blocks of the original matrix rocblas_init(hA, true); rocblas_init(hB, false); rocblas_init(hC, false); // initialize solution vectors rocblas_init(hX, false); for(rocblas_int b = 0; b < bc; ++b) { // form original matrix M and scale to avoid singularities for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { if(i == j) M[i + j * n + k * (n + 1) * nb] = hB[b][i + j * ldb + k * ldb * nb] + 400; else M[i + j * n + k * (n + 1) * nb] = hB[b][i + j * ldb + k * ldb * nb] - 4; if(k < nblocks - 1) { M[(i + nb) + j * n + k * (n + 1) * nb] = hA[b][i + j * lda + k * lda * nb] - 4; M[i + (j + nb) * n + k * (n + 1) * nb] = hC[b][i + j * ldc + k * ldc * nb] - 4; } } } } // move blocks of X to full matrix XX for(rocblas_int k = 0; k < nblocks; k++) for(rocblas_int i = 0; i < nb; i++) for(rocblas_int j = 0; j < nrhs; j++) XX[i + j * n + k * nb] = hX[b][i + j * ldx + k * ldx * nrhs]; // generate the full matrix of right-hand-side vectors XB by computing M * XX cpu_gemm(rocblas_operation_none, rocblas_operation_none, n, nrhs, n, T(1), M.data(), n, XX.data(), n, T(0), XB.data(), n); // move XB to block format in hRHS for(rocblas_int k = 0; k < nblocks; k++) for(rocblas_int i = 0; i < nb; i++) for(rocblas_int j = 0; j < nrhs; j++) hRHS[b][i + j * ldx + k * ldx * nrhs] = XB[i + j * n + k * nb]; // 
factorize M cpu_getrf(nb, nb, M.data(), n, ipiv.data(), &info); for(rocblas_int k = 0; k < nblocks - 1; k++) { cpu_getrs(rocblas_operation_none, nb, nb, M.data() + k * (n + 1) * nb, n, ipiv.data(), M.data() + nb * n + k * (n + 1) * nb, n); cpu_gemm(rocblas_operation_none, rocblas_operation_none, nb, nb, nb, T(-1), M.data() + nb + k * (n + 1) * nb, n, M.data() + nb * n + k * (n + 1) * nb, n, T(1), M.data() + (k + 1) * (n + 1) * nb, n); cpu_getrf(nb, nb, M.data() + (k + 1) * (n + 1) * nb, n, ipiv.data(), &info); } // move factorized blocks from M into hA, hB, and hC for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { hB[b][i + j * ldb + k * ldb * nb] = M[i + j * n + k * (n + 1) * nb]; if(k < nblocks - 1) { hA[b][i + j * lda + k * lda * nb] = M[(i + nb) + j * n + k * (n + 1) * nb]; hC[b][i + j * ldc + k * ldc * nb] = M[i + (j + nb) * n + k * (n + 1) * nb]; } } } } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dC.transfer_from(hC)); CHECK_HIP_ERROR(dX.transfer_from(hRHS)); } } template void geblttrs_npvt_getError(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Td& dX, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hXRes, double* max_err) { // input data initialization geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, dX, ldx, bc, hA, hB, hC, hX, hXRes); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc)); CHECK_HIP_ERROR(hXRes.transfer_from(dX)); double err 
= 0; *max_err = 0; // error is ||hX - hXRes|| / ||hX|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', nb, nrhs * nblocks, ldx, hX[b], hXRes[b]); *max_err = err > *max_err ? err : *max_err; } } template void geblttrs_npvt_getPerfData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Td& dX, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hXRes, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { // there is no direct CPU/LAPACK equivalent for this function, therefore // we return an invalid CPU time *cpu_time_used = nan(""); } geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, dX, ldx, bc, hA, hB, hC, hX, hXRes); // cold calls for(int iter = 0; iter < 2; iter++) { geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, dX, ldx, bc, hA, hB, hC, hX, hXRes); CHECK_ROCBLAS_ERROR(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geblttrs_npvt_initData(handle, nb, nblocks, nrhs, dA, lda, dB, ldb, dC, ldc, 
dX, ldx, bc, hA, hB, hC, hX, hXRes); start = get_time_us_sync(stream); rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geblttrs_npvt(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int nb = argus.get("nb"); rocblas_int nblocks = argus.get("nblocks"); rocblas_int nrhs = argus.get("nrhs"); rocblas_int lda = argus.get("lda", nb); rocblas_int ldb = argus.get("ldb", nb); rocblas_int ldc = argus.get("ldc", nb); rocblas_int ldx = argus.get("ldx", nb); rocblas_stride stA = argus.get("strideA", lda * nb * nblocks); rocblas_stride stB = argus.get("strideB", ldb * nb * nblocks); rocblas_stride stC = argus.get("strideC", ldc * nb * nblocks); rocblas_stride stX = argus.get("strideX", ldx * nrhs * nblocks); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stXRes = stX; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * nb * nblocks; size_t size_B = size_t(ldb) * nb * nblocks; size_t size_C = size_t(ldc) * nb * nblocks; size_t size_X = size_t(ldx) * nrhs * nblocks; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_XRes = size_X; // check invalid sizes bool invalid_size = (nb < 0 || nblocks < 0 || nrhs < 0 || lda < nb || ldb < nb || ldc < nb || ldx < nb || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (T* const*)nullptr, ldx, stX, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (T*)nullptr, ldx, stX, bc), rocblas_status_invalid_size); if(argus.timing) 
rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geblttrs_npvt( STRIDED, handle, nb, nblocks, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldc, stC, (T* const*)nullptr, ldx, stX, bc)); else CHECK_ALLOC_QUERY(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldc, stC, (T*)nullptr, ldx, stX, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hC(size_C, 1, bc); host_batch_vector hX(size_X, 1, bc); host_batch_vector hXRes(size_XRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dC(size_C, 1, bc); device_batch_vector dX(size_X, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrs_npvt_getError(handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &max_error); // collect performance data 
if(argus.timing) geblttrs_npvt_getPerfData(handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hC(size_C, 1, stC, bc); host_strided_batch_vector hX(size_X, 1, stX, bc); host_strided_batch_vector hXRes(size_XRes, 1, stXRes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dX(size_X, 1, stX, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt(STRIDED, handle, nb, nblocks, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dC.data(), ldc, stC, dX.data(), ldx, stX, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrs_npvt_getError(handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &max_error); // collect performance data if(argus.timing) geblttrs_npvt_getPerfData(handle, nb, nblocks, nrhs, dA, lda, stA, dB, ldb, stB, dC, ldc, stC, dX, ldx, stX, bc, hA, hB, hC, hX, hXRes, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using nb * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { 
rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("nb", "nblocks", "nrhs", "lda", "ldb", "ldc", "ldx", "batch_c"); rocsolver_bench_output(nb, nblocks, nrhs, lda, ldb, ldc, ldx, bc); } else if(STRIDED) { rocsolver_bench_output("nb", "nblocks", "nrhs", "lda", "strideA", "ldb", "strideB", "ldc", "strideC", "ldx", "strideX", "batch_c"); rocsolver_bench_output(nb, nblocks, nrhs, lda, stA, ldb, stB, ldc, stC, ldx, stX, bc); } else { rocsolver_bench_output("nb", "nblocks", "nrhs", "lda", "ldb", "ldc", "ldx"); rocsolver_bench_output(nb, nblocks, nrhs, lda, ldb, ldc, ldx); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEBLTTRS_NPVT(...) \ extern template void testing_geblttrs_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEBLTTRS_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrs_npvt_interleaved.cpp000066400000000000000000000033721503202240500301020ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_geblttrs_npvt_interleaved.hpp" #define TESTING_GEBLTTRS_NPVT_INTERLEAVED(...) \ template void testing_geblttrs_npvt_interleaved<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEBLTTRS_NPVT_INTERLEAVED, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geblttrs_npvt_interleaved.hpp000066400000000000000000000702721503202240500301120ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void geblttrs_npvt_interleaved_checkBadArgs(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, T dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, T dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, T dX, const rocblas_int incx, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved(nullptr, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, 
incx, ldx, stX, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved(handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved( handle, nb, nblocks, nrhs, (T) nullptr, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved( handle, nb, nblocks, nrhs, dA, inca, lda, stA, (T) nullptr, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved( handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, (T) nullptr, incc, ldc, stC, dX, incx, ldx, stX, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved( handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, (T) nullptr, incx, ldx, stX, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved(handle, 0, nblocks, nrhs, (T) nullptr, inca, lda, stA, (T) nullptr, incb, ldb, stB, (T) nullptr, incc, ldc, stC, (T) nullptr, incx, ldx, stX, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved(handle, nb, 0, nrhs, (T) nullptr, inca, lda, stA, (T) nullptr, incb, ldb, stB, (T) nullptr, incc, ldc, stC, (T) nullptr, incx, ldx, stX, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved(handle, nb, nblocks, 0, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, (T) nullptr, incx, ldx, stX, bc), rocblas_status_success); // quick return with zero batch_count if applicable EXPECT_ROCBLAS_STATUS(rocsolver_geblttrs_npvt_interleaved(handle, nb, 
nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, 0), rocblas_status_success); } template void testing_geblttrs_npvt_interleaved_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int nb = 1; rocblas_int nblocks = 2; rocblas_int nrhs = 1; rocblas_int inca = 1; rocblas_int incb = 1; rocblas_int incc = 1; rocblas_int incx = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldc = 1; rocblas_int ldx = 1; rocblas_stride stA = 2; rocblas_stride stB = 2; rocblas_stride stC = 2; rocblas_stride stX = 2; rocblas_int bc = 1; // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); // check bad arguments geblttrs_npvt_interleaved_checkBadArgs(handle, nb, nblocks, nrhs, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dX.data(), incx, ldx, stX, bc); } template void geblttrs_npvt_interleaved_initData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, Td& dX, const rocblas_int incx, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hRHS) { if(CPU) { int info; int n = nb * nblocks; std::vector M(n * n); std::vector XX(n * nrhs); std::vector XB(n * nrhs); std::vector ipiv(nb); // initialize blocks of the original matrix rocblas_init(hA, true); rocblas_init(hB, false); rocblas_init(hC, false); // initialize solution vectors rocblas_init(hX, false); for(rocblas_int b = 0; b < 
bc; ++b) { T* A = hA[0] + b * stA; T* B = hB[0] + b * stB; T* C = hC[0] + b * stC; T* X = hX[0] + b * stX; T* RHS = hRHS[0] + b * stX; // form original matrix M and scale to avoid singularities for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nb; j++) { if(i == j) M[i + j * n + k * (n + 1) * nb] = B[i * incb + j * ldb + k * ldb * nb] + 400; else M[i + j * n + k * (n + 1) * nb] = B[i * incb + j * ldb + k * ldb * nb] - 4; if(k < nblocks - 1) { M[(i + nb) + j * n + k * (n + 1) * nb] = A[i * inca + j * lda + k * lda * nb] - 4; M[i + (j + nb) * n + k * (n + 1) * nb] = C[i * incc + j * ldc + k * ldc * nb] - 4; } } } } // move blocks of X to full matrix XX for(rocblas_int k = 0; k < nblocks; k++) for(rocblas_int i = 0; i < nb; i++) for(rocblas_int j = 0; j < nrhs; j++) XX[i + j * n + k * nb] = X[i * incx + j * ldx + k * ldx * nrhs]; // generate the full matrix of right-hand-side vectors XB by computing M * XX cpu_gemm(rocblas_operation_none, rocblas_operation_none, n, nrhs, n, T(1), M.data(), n, XX.data(), n, T(0), XB.data(), n); // move XB to block format in hRHS for(rocblas_int k = 0; k < nblocks; k++) for(rocblas_int i = 0; i < nb; i++) for(rocblas_int j = 0; j < nrhs; j++) RHS[i * incx + j * ldx + k * ldx * nrhs] = XB[i + j * n + k * nb]; // factorize M cpu_getrf(nb, nb, M.data(), n, ipiv.data(), &info); for(rocblas_int k = 0; k < nblocks - 1; k++) { cpu_getrs(rocblas_operation_none, nb, nb, M.data() + k * (n + 1) * nb, n, ipiv.data(), M.data() + nb * n + k * (n + 1) * nb, n); cpu_gemm(rocblas_operation_none, rocblas_operation_none, nb, nb, nb, T(-1), M.data() + nb + k * (n + 1) * nb, n, M.data() + nb * n + k * (n + 1) * nb, n, T(1), M.data() + (k + 1) * (n + 1) * nb, n); cpu_getrf(nb, nb, M.data() + (k + 1) * (n + 1) * nb, n, ipiv.data(), &info); } // move factorized blocks from M into hA, hB, and hC for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < 
nb; j++) { B[i * incb + j * ldb + k * ldb * nb] = M[i + j * n + k * (n + 1) * nb]; if(k < nblocks - 1) { A[i * inca + j * lda + k * lda * nb] = M[(i + nb) + j * n + k * (n + 1) * nb]; C[i * incc + j * ldc + k * ldc * nb] = M[i + (j + nb) * n + k * (n + 1) * nb]; } } } } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dC.transfer_from(hC)); CHECK_HIP_ERROR(dX.transfer_from(hRHS)); } } template void geblttrs_npvt_interleaved_getError(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, Td& dX, const rocblas_int incx, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hXRes, double* max_err) { std::vector Xtmp(nb * nrhs * nblocks); std::vector XtmpRes(nb * nrhs * nblocks); // input data initialization geblttrs_npvt_interleaved_initData(handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc, hA, hB, hC, hX, hXRes); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geblttrs_npvt_interleaved( handle, nb, nblocks, nrhs, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dX.data(), incx, ldx, stX, bc)); CHECK_HIP_ERROR(hXRes.transfer_from(dX)); double err = 0; *max_err = 0; // error is ||hX - hXRes|| / ||hX|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { // put X and XRes into Xtmp and XtmpRes in column-major format for(rocblas_int k = 0; k < nblocks; k++) { for(rocblas_int i = 0; i < nb; i++) { for(rocblas_int j = 0; j < nrhs; j++) { Xtmp[i + j * nb + k * nb * nrhs] = hX[0][i * incx + j * ldx + k * ldx * nrhs + b * stX]; XtmpRes[i + j * nb + k * nb * nrhs] = hXRes[0][i * incx + j * ldx + k * ldx * nrhs + b * stX]; } } } err = norm_error('F', nb, nrhs * nblocks, nb, Xtmp.data(), XtmpRes.data()); *max_err = err > *max_err ? err : *max_err; } } template void geblttrs_npvt_interleaved_getPerfData(const rocblas_handle handle, const rocblas_int nb, const rocblas_int nblocks, const rocblas_int nrhs, Td& dA, const rocblas_int inca, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int incb, const rocblas_int ldb, const rocblas_stride stB, Td& dC, const rocblas_int incc, const rocblas_int ldc, const rocblas_stride stC, Td& dX, const rocblas_int incx, const rocblas_int ldx, const rocblas_stride stX, const rocblas_int bc, Th& hA, Th& hB, Th& hC, Th& hX, Th& hXRes, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { // there is no direct CPU/LAPACK equivalent for this function, therefore // we return an invalid CPU time *cpu_time_used = nan(""); } geblttrs_npvt_interleaved_initData(handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc, hA, hB, hC, hX, hXRes); // cold calls for(int iter = 0; iter < 2; iter++) { geblttrs_npvt_interleaved_initData( handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc, hA, hB, hC, hX, hXRes); CHECK_ROCBLAS_ERROR(rocsolver_geblttrs_npvt_interleaved( handle, nb, nblocks, nrhs, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dX.data(), 
incx, ldx, stX, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geblttrs_npvt_interleaved_initData( handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc, hA, hB, hC, hX, hXRes); start = get_time_us_sync(stream); rocsolver_geblttrs_npvt_interleaved(handle, nb, nblocks, nrhs, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dX.data(), incx, ldx, stX, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geblttrs_npvt_interleaved(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int nb = argus.get("nb"); rocblas_int nblocks = argus.get("nblocks"); rocblas_int nrhs = argus.get("nrhs"); rocblas_int inca = argus.get("inca", 1); rocblas_int incb = argus.get("incb", 1); rocblas_int incc = argus.get("incc", 1); rocblas_int incx = argus.get("incx", 1); rocblas_int lda = argus.get("lda", nb); rocblas_int ldb = argus.get("ldb", nb); rocblas_int ldc = argus.get("ldc", nb); rocblas_int ldx = argus.get("ldx", nb); rocblas_stride stA = argus.get("strideA", lda * nb * nblocks); rocblas_stride stB = argus.get("strideB", ldb * nb * nblocks); rocblas_stride stC = argus.get("strideC", ldc * nb * nblocks); rocblas_stride stX = argus.get("strideX", ldx * nrhs * nblocks); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stXRes = stX; // check non-supported values // N/A // determine sizes rocblas_int n = nb * nblocks; size_t size_A = std::max(size_t(lda) * n, size_t(stA)) * bc; size_t size_B = std::max(size_t(ldb) 
* n, size_t(stB)) * bc; size_t size_C = std::max(size_t(ldc) * n, size_t(stC)) * bc; size_t size_X = std::max(size_t(ldx) * nrhs * nblocks, size_t(stX)) * bc; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_XRes = size_X; // check invalid sizes bool invalid_a = (inca < 1 || lda < inca * nb); bool invalid_b = (incc < 1 || ldc < incc * nb); bool invalid_c = (incc < 1 || ldc < incc * nb); bool invalid_x = (incx < 1 || ldx < incx * nb); bool invalid_size = (nb < 0 || nblocks < 0 || nrhs < 0 || bc < 0 || invalid_a || invalid_b || invalid_c || invalid_x); if(invalid_size) { EXPECT_ROCBLAS_STATUS( rocsolver_geblttrs_npvt_interleaved(handle, nb, nblocks, nrhs, (T*)nullptr, inca, lda, stA, (T*)nullptr, incb, ldb, stB, (T*)nullptr, incc, ldc, stC, (T*)nullptr, incx, ldx, stX, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); CHECK_ALLOC_QUERY(rocsolver_geblttrs_npvt_interleaved( handle, nb, nblocks, nrhs, (T*)nullptr, inca, lda, stA, (T*)nullptr, incb, ldb, stB, (T*)nullptr, incc, ldc, stC, (T*)nullptr, incx, ldx, stX, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations host_strided_batch_vector hA(size_A, 1, size_A, 1); host_strided_batch_vector hB(size_B, 1, size_B, 1); host_strided_batch_vector hC(size_C, 1, size_C, 1); host_strided_batch_vector hX(size_X, 1, size_X, 1); host_strided_batch_vector hXRes(size_XRes, 1, size_XRes, 1); device_strided_batch_vector dA(size_A, 1, size_A, 1); device_strided_batch_vector dB(size_B, 1, size_B, 1); device_strided_batch_vector dC(size_C, 1, size_C, 1); device_strided_batch_vector dX(size_X, 
1, size_X, 1); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); // check quick return if(nb == 0 || nblocks == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_geblttrs_npvt_interleaved(handle, nb, nblocks, nrhs, dA.data(), inca, lda, stA, dB.data(), incb, ldb, stB, dC.data(), incc, ldc, stC, dX.data(), incx, ldx, stX, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geblttrs_npvt_interleaved_getError(handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc, hA, hB, hC, hX, hXRes, &max_error); // collect performance data if(argus.timing) geblttrs_npvt_interleaved_getPerfData( handle, nb, nblocks, nrhs, dA, inca, lda, stA, dB, incb, ldb, stB, dC, incc, ldc, stC, dX, incx, ldx, stX, bc, hA, hB, hC, hX, hXRes, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); // validate results for rocsolver-test // using nb * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, nb); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); rocsolver_bench_output("nb", "nblocks", "nrhs", "inca", "lda", "strideA", "incb", "ldb", "strideB", "incc", "ldc", "strideC", "incx", "ldx", "strideX", "batch_c"); rocsolver_bench_output(nb, nblocks, nrhs, inca, lda, stA, incb, ldb, stB, incc, ldc, stC, incx, ldx, stX, bc); rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { 
if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEBLTTRS_NPVT_INTERLEAVED(...) \ extern template void testing_geblttrs_npvt_interleaved<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEBLTTRS_NPVT_INTERLEAVED, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gelq2_gelqf.cpp000066400000000000000000000034411503202240500250100ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #include "testing_gelq2_gelqf.hpp" #define TESTING_GELQ2_GELQF(...) template void testing_gelq2_gelqf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GELQ2_GELQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gelq2_gelqf.hpp000066400000000000000000000437711503202240500250270ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gelq2_gelqf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA, lda, stA, dIpiv, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA, lda, stA, dIpiv, stP, 0), rocblas_status_success); } template void testing_gelq2_gelqf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; 
rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gelq2_gelqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gelq2_gelqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } template void gelq2_gelqf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gelq2_gelqf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(m); // input data initialization gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GELQF ? 
cpu_gelqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gelq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||QR - Qres Rres|| / ||QR||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void gelq2_gelqf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(m); if(!perf) { gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GELQF ? 
cpu_gelqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gelq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gelq2_gelqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gelq2_gelqf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gelq2_gelqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) 
gelq2_gelqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gelq2_gelqf(STRIDED, GELQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gelq2_gelqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) gelq2_gelqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using n * machine_precision as tolerance // (for possibly singular of ill-conditioned matrices we could use n*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GELQ2_GELQF(...) \ extern template void testing_gelq2_gelqf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GELQ2_GELQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gels.cpp000066400000000000000000000032741503202240500235560ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_gels.hpp" #define TESTING_GELS(...) template void testing_gels<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GELS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gels.hpp000066400000000000000000000613341503202240500235640ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gels_checkBadArgs(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, U dA, const rocblas_int lda, const rocblas_stride stA, U dB, const rocblas_int ldb, const rocblas_stride stB, rocblas_int* info, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_gels(STRIDED, nullptr, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, rocblas_operation(0), m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, bc), rocblas_status_invalid_value) << "Must report error when operation is invalid"; // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, -1), rocblas_status_invalid_size) << "Must report error when batch size is negative"; // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, info, bc), rocblas_status_invalid_pointer) << 
"Should normally report error when A is null"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, (U) nullptr, ldb, stB, info, bc), rocblas_status_invalid_pointer) << "Should normally report error when B is null"; EXPECT_ROCBLAS_STATUS( rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, nullptr, bc), rocblas_status_invalid_pointer) << "Should normally report error when info is null"; // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, 0, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, info, bc), rocblas_status_success) << "Matrix A may be null when m is 0 (empty matrix)"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, 0, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, info, bc), rocblas_status_success) << "Matrix A may be null when n is 0 (empty matrix)"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, 0, dA, lda, stA, (U) nullptr, ldb, stB, info, bc), rocblas_status_success) << "Matrix B may be null when nhrs is 0 (empty matrix)"; EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, 0, 0, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, info, bc), rocblas_status_success) << "Matrices A and B may be null when m and n are 0 (empty matrix)"; if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, nullptr, 0), rocblas_status_success) << "Info may be null when batch size is 0"; // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, info, 0), rocblas_status_success); } template void testing_gels_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_int bc = 1; rocblas_operation trans = 
rocblas_operation_none; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } } template void gels_initData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hInfo, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); const rocblas_int max_index = std::max(0, std::min(m, n) - 1); std::uniform_int_distribution sample_index(0, max_index); std::bernoulli_distribution coinflip(0.5); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // add some singularities // always the same elements for debugging purposes if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { do { if(n <= m) { // zero random col rocblas_int j = sample_index(rocblas_rng); for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; } else { // zero random row rocblas_int i = sample_index(rocblas_rng); for(rocblas_int j = 0; j < n; j++) hA[b][i + j * lda] = 0; } 
} while(coinflip(rocblas_rng)); } } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void gels_getError(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = std::max(1, std::min(m, n) + std::max(std::min(m, n), nrhs)); std::vector hW(sizeW); // input data initialization gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hB[b], ldb, hW.data(), sizeW, hInfo[b]); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('I', std::max(m, n), nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? 
err : *max_err; } // also check info for singularities err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; } template void gels_getPerfData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = std::max(1, std::min(m, n) + std::max(std::min(m, n), nrhs)); std::vector hW(sizeW); if(!perf) { gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hB[b], ldb, hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; 
iter < hot_calls; iter++) { gels_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, singular); start = get_time_us_sync(stream); rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_gels(Arguments& argus) { // get arguments rocblas_local_handle handle; char transC = argus.get("trans"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", m); rocblas_int ldb = argus.get("ldb", std::max(m, n)); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values bool invalid_value = ((COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? 
size_B : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || nrhs < 0 || lda < m || ldb < m || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, 
stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_getPerfData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gels(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_getPerfData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using max(m,n) * machine_precision as tolerance 
if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, std::max(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "strideA", "strideB", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, stA, stB, bc); } else { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GELS(...) extern template void testing_gels<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GELS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gels_outofplace.hpp000066400000000000000000000745471503202240500260170ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gels_outofplace_checkBadArgs(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, U dA, const rocblas_int lda, const rocblas_stride stA, U dB, const rocblas_int ldb, const rocblas_stride stB, U dX, const rocblas_int ldx, const rocblas_stride stX, rocblas_int* info, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, nullptr, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, rocblas_operation(0), m, n, nrhs, dA, lda, stA, dB, ldb, stB, 
dX, ldx, stX, info, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, info, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, (U) nullptr, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, rocblas_operation_none, 0, n, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, 0, n, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, rocblas_operation_none, m, 0, nrhs, (U) nullptr, lda, stA, dB, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, 0, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, dX, ldx, stX, info, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, 0, dA, lda, stA, (U) nullptr, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); 
EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, 0, 0, nrhs, (U) nullptr, lda, stA, (U) nullptr, ldb, stB, (U) nullptr, ldx, stX, info, bc), rocblas_status_success); if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, info, 0), rocblas_status_success); } template void testing_gels_outofplace_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldx = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stX = 1; rocblas_int bc = 1; rocblas_operation trans = (!rocblas_is_complex ? rocblas_operation_transpose : rocblas_operation_conjugate_transpose); if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dX(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_outofplace_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dX(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dX.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gels_outofplace_checkBadArgs(handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); 
} } template void gels_outofplace_initData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hX, Uh& hInfo, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); const rocblas_int max_index = std::max(0, std::min(m, n) - 1); std::uniform_int_distribution sample_index(0, max_index); std::bernoulli_distribution coinflip(0.5); const rocblas_int rowsB = (trans == rocblas_operation_none) ? m : n; const rocblas_int ldx = std::max(m, n); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // populate hX with values from hB for(rocblas_int i = 0; i < rowsB; i++) for(rocblas_int j = 0; j < nrhs; j++) hX[b][i + j * ldx] = hB[b][i + j * ldb]; // add some singularities // always the same elements for debugging purposes if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { do { if(n <= m) { // zero random col rocblas_int j = sample_index(rocblas_rng); for(rocblas_int i = 0; i < m; i++) hA[b][i + j * lda] = 0; } else { // zero random row rocblas_int i = sample_index(rocblas_rng); for(rocblas_int j = 0; j < n; j++) hA[b][i + j * lda] = 0; } } while(coinflip(rocblas_rng)); } } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void gels_outofplace_getError(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dX, const rocblas_int ldx, const rocblas_stride 
stX, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Th& hX, Th& hXRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = std::max(1, std::min(m, n) + std::max(std::min(m, n), nrhs)); std::vector hW(sizeW); // input data initialization gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hXRes.transfer_from(dX)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hX[b], std::max(m, n), hW.data(), sizeW, hInfo[b]); } // error is ||hX - hXRes|| / ||hX|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { const rocblas_int rowsB = (trans == rocblas_operation_none) ? m : n; err = norm_error('F', rowsB, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? err : *max_err; if(hInfo[b][0] == 0) { const rocblas_int rowsX = (trans == rocblas_operation_none) ? n : m; err = norm_error('I', rowsX, nrhs, std::max(m, n), hX[b], hXRes[b], ldx); *max_err = err > *max_err ? 
err : *max_err; } } // also check info for singularities err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; } template void gels_outofplace_getPerfData(const rocblas_handle handle, const rocblas_operation trans, const rocblas_int m, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Td& dX, const rocblas_int ldx, const rocblas_stride stX, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hX, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = std::max(1, std::min(m, n) + std::max(std::min(m, n), nrhs)); std::vector hW(sizeW); if(!perf) { gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_gels(trans, m, n, nrhs, hA[b], lda, hX[b], std::max(m, n), hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | 
rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gels_outofplace_initData(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hX, hInfo, singular); start = get_time_us_sync(stream); rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template > void testing_gels_outofplace(Arguments& argus) { // get arguments rocblas_local_handle handle; char transC = argus.get("trans"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", m); rocblas_int ldb = argus.get("ldb", transC == 'N' ? m : n); rocblas_int ldx = argus.get("ldx", transC == 'N' ? n : m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_stride stX = argus.get("strideX", ldx * nrhs); rocblas_operation trans = char2rocblas_operation(transC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; rocblas_stride stXRes = (argus.unit_check || argus.norm_check) ? 
stX : 0; // check non-supported values bool invalid_value = ((COMPLEX && trans == rocblas_operation_transpose) || (!COMPLEX && trans == rocblas_operation_conjugate_transpose)); if(invalid_value) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; size_t size_X = size_t(ldx) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; size_t size_XRes = (argus.unit_check || argus.norm_check) ? 
size_X : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || nrhs < 0 || lda < m || (trans == rocblas_operation_none && (ldb < m || ldx < n)) || (trans != rocblas_operation_none && (ldb < n || ldx < m)) || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gels_outofplace( STRIDED, handle, trans, m, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_gels_outofplace( STRIDED, handle, trans, m, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_batch_vector hX(std::max(m, n) * nrhs, 1, bc); host_batch_vector hXRes(size_XRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); 
device_batch_vector dX(size_X, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_outofplace_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hBRes, hX, hXRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_outofplace_getPerfData( handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hX, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hX(std::max(m, n) * nrhs, 1, std::max(m, n) * nrhs, bc); host_strided_batch_vector hXRes(size_XRes, 1, stXRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dX(size_X, 1, stX, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(bc) CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(m == 0 || n == 0 || nrhs == 0 || bc == 0) { 
EXPECT_ROCBLAS_STATUS(rocsolver_gels_outofplace(STRIDED, handle, trans, m, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gels_outofplace_getError(handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hBRes, hX, hXRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gels_outofplace_getPerfData( handle, trans, m, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hB, hX, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using max(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, std::max(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "ldx", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, ldx, bc); } else if(STRIDED) { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "ldx", "strideA", "strideB", "strideX", "batch_c"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, ldx, stA, stB, stX, bc); } else { rocsolver_bench_output("trans", "m", "n", "nrhs", "lda", "ldb", "ldx"); rocsolver_bench_output(transC, m, n, nrhs, lda, ldb, ldx); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else 
rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geql2_geqlf.cpp000066400000000000000000000034411503202240500250100ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_geql2_geqlf.hpp" #define TESTING_GEQL2_GEQLF(...) 
template void testing_geql2_geqlf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEQL2_GEQLF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geql2_geqlf.hpp000066400000000000000000000440011503202240500250120ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void geql2_geqlf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA, lda, stA, dIpiv, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA, lda, stA, dIpiv, stP, 0), rocblas_status_success); } template void testing_geql2_geqlf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; 
rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geql2_geqlf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geql2_geqlf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } template void geql2_geqlf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(m - i == n - j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void geql2_geqlf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(n); // input data initialization geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GEQLF ? 
cpu_geqlf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geql2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||QL - Qres Lres|| / ||QL||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void geql2_geqlf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(n); if(!perf) { geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GEQLF ? 
cpu_geqlf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geql2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geql2_geqlf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geql2_geqlf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geql2_geqlf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) 
geql2_geqlf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geql2_geqlf(STRIDED, GEQLF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geql2_geqlf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) geql2_geqlf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance // (for possibly singular of ill-conditioned matrices we could use m*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEQL2_GEQLF(...) \ extern template void testing_geql2_geqlf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEQL2_GEQLF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geqr2_geqrf.cpp000066400000000000000000000034771503202240500250350ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_geqr2_geqrf.hpp" #define TESTING_GEQR2_GEQRF(...) template void testing_geqr2_geqrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GEQR2_GEQRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_geqr2_geqrf.hpp000066400000000000000000000463331503202240500250400ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void geqr2_geqrf_checkBadArgs(const rocblas_handle handle, const I m, const I n, T dA, const I lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const I bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA, lda, stA, dIpiv, stP, (I)-1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, (I)0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, 
handle, m, (I)0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA, lda, stA, dIpiv, stP, (I)0), rocblas_status_success); } template void testing_geqr2_geqrf_bad_arg() { // safe arguments rocblas_local_handle handle; I m = 1; I n = 1; I lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; I bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geqr2_geqrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments geqr2_geqrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } template void geqr2_geqrf_initData(const rocblas_handle handle, const I m, const I n, Td& dA, const I lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const I bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(I b = 0; b < bc; ++b) { for(I i = 0; i < m; i++) { for(I j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void geqr2_geqrf_getError(const rocblas_handle handle, const I m, const I n, Td& dA, const I lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const I bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(n); // input data initialization geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack 
CHECK_ROCBLAS_ERROR(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(I b = 0; b < bc; ++b) { GEQRF ? cpu_geqrf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geqr2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||QR - Qres Rres|| / ||QR||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(I b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void geqr2_geqrf_getPerfData(const rocblas_handle handle, const I m, const I n, Td& dA, const I lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const I bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(n); if(!perf) { geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(I b = 0; b < bc; ++b) { GEQRF ? 
cpu_geqrf(m, n, hA[b], lda, hIpiv[b], hW.data(), n) : cpu_geqr2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { geqr2_geqrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_geqr2_geqrf(Arguments& argus) { // get arguments rocblas_local_handle handle; I m = argus.get("m"); I n = argus.get("n", m); I lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); I bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED && STRIDED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geqr2_geqrf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data 
if(argus.timing) geqr2_geqrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hIpiv(size_P, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dIpiv(size_P, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geqr2_geqrf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) geqr2_geqrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_geqr2_geqrf(STRIDED, GEQRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) geqr2_geqrf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // 
collect performance data if(argus.timing) geqr2_geqrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance // (for possibly singular of ill-conditioned matrices we could use m*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GEQR2_GEQRF(...) \ extern template void testing_geqr2_geqrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GEQR2_GEQRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gerq2_gerqf.cpp000066400000000000000000000034411503202240500250240ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_gerq2_gerqf.hpp" #define TESTING_GERQ2_GERQF(...) template void testing_gerq2_gerqf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GERQ2_GERQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gerq2_gerqf.hpp000066400000000000000000000437711503202240500250430ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gerq2_gerqf_checkBadArgs(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, nullptr, m, n, dA, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA, lda, stA, dIpiv, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T) nullptr, lda, stA, dIpiv, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA, lda, stA, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, 0, n, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, 0, (T) nullptr, lda, stA, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA, lda, stA, dIpiv, stP, 0), rocblas_status_success); } template void testing_gerq2_gerqf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int m = 1; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; 
rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gerq2_gerqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments gerq2_gerqf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); } } template void gerq2_gerqf_initData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gerq2_gerqf_getError(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, double* max_err) { std::vector hW(m); // input data initialization gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { GERQF ? 
cpu_gerqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gerq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } // error is ||hA - hARes|| / ||hA|| (ideally ||QR - Qres Rres|| / ||QR||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void gerq2_gerqf_getPerfData(const rocblas_handle handle, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, const rocblas_int bc, Th& hA, Uh& hIpiv, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(m); if(!perf) { gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { GERQF ? 
cpu_gerqf(m, n, hA[b], lda, hIpiv[b], hW.data(), m) : cpu_gerq2(m, n, hA[b], lda, hIpiv[b], hW.data()); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); // cold calls for(int iter = 0; iter < 2; iter++) { gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); CHECK_ROCBLAS_ERROR(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gerq2_gerqf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv); start = get_time_us_sync(stream); rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gerq2_gerqf(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T* const*)nullptr, lda, stA, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, (T*)nullptr, lda, stA, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gerq2_gerqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) 
gerq2_gerqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gerq2_gerqf(STRIDED, GERQF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gerq2_gerqf_getError(handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hARes, hIpiv, &max_error); // collect performance data if(argus.timing) gerq2_gerqf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, bc, hA, hIpiv, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance // (for possibly singular of ill-conditioned matrices we could use m*min(m,n)) if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, m); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GERQ2_GERQF(...) \ extern template void testing_gerq2_gerqf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GERQ2_GERQF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesdd.cpp000066400000000000000000000033001503202240500237000ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_gesdd.hpp" #define TESTING_GESDD(...) template void testing_gesdd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESDD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesdd.hpp000066400000000000000000001056511503202240500237210ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/matrix_utils/matrix_utils.hpp" #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gesdd_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dS, const rocblas_stride stS, U dU, const rocblas_int ldu, const rocblas_stride stU, U dV, const rocblas_int ldv, const rocblas_stride stV, I dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, nullptr, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, rocblas_svect_overwrite, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, rocblas_svect_overwrite, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, (T) nullptr, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, (S) nullptr, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, 0, n, (T) nullptr, lda, stA, (S) nullptr, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, 0, (T) nullptr, lda, stA, (S) nullptr, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, 0), rocblas_status_success); } template void testing_gesdd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = 
rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesdd_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesdd_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } }

/** Prepares the input matrices for a GESDD run.
 *
 *  If `CPU` is set, `hA` is initialised with `rocblas_init` and every batch
 *  instance is then adjusted in place:
 *    - `singular == false`: 400 is added to the diagonal entries and 4 is
 *      subtracted from all other entries of the m-by-n matrix, to avoid
 *      singularities;
 *    - `singular == true`: the m-by-n matrix is overwritten with ones.
 *  If `test` is set and at least one of `left_svect` / `right_svect` is not
 *  `rocblas_svect_none`, the m-by-n matrix of instance `b` is also copied into
 *  `A`, starting at element `b * lda * n` (column-major, leading dimension
 *  `lda`). `A` must then hold at least `lda * n * bc` elements.
 *
 *  If `GPU` is set, `hA` is transferred to `dA`.
 *
 *  `handle` is not used by this routine.
 **/
template void gesdd_initData(const rocblas_handle handle,
                             const rocblas_svect left_svect,
                             const rocblas_svect right_svect,
                             const rocblas_int m,
                             const rocblas_int n,
                             Td& dA,
                             const rocblas_int lda,
                             const rocblas_int bc,
                             Th& hA,
                             std::vector& A,
                             const bool test = true,
                             const bool singular = false)
{
    if(CPU)
    {
        rocblas_init(hA, true);

        // loop-invariant: a copy of the input is only kept when singular
        // vectors have to be verified afterwards
        const bool keep_copy
            = test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none);

        for(rocblas_int b = 0; b < bc; ++b)
        {
            if(!singular)
            {
                // scale A to avoid singularities
                for(rocblas_int i = 0; i < m; i++)
                {
                    for(rocblas_int j = 0; j < n; j++)
                    {
                        if(i == j)
                            hA[b][i + j * lda] += 400;
                        else
                            hA[b][i + j * lda] -= 4;
                    }
                }
            }
            else
            {
                // form a singular matrix consisting of all ones
                for(rocblas_int i = 0; i < m; i++)
                {
                    for(rocblas_int j = 0; j < n; j++)
                    {
                        hA[b][i + j * lda] = 1;
                    }
                }
            }

            // make copy of original data to test vectors if required
            if(keep_copy)
            {
                // The batch offset is computed in size_t: b * lda * n may not
                // fit in a rocblas_int for large batched problems.
                const size_t offset = size_t(b) * lda * n;

                for(rocblas_int i = 0; i < m; i++)
                {
                    for(rocblas_int j = 0; j < n; j++)
                        A[offset + (i + j * lda)] = hA[b][i + j * lda];
                }
            }
        }
    }

    if(GPU)
    {
        // now copy to the GPU
        CHECK_HIP_ERROR(dA.transfer_from(hA));
    }
}

/** Runs GESDD on the device and computes the test errors.
 *
 *  Steps:
 *    1. Initialise the input and call `rocsolver_gesdd` with the auxiliary
 *       ("T") arguments. When only one of the two sets of singular vectors was
 *       requested, the missing set is taken from this call (`dUT` -> `Ures`
 *       or `dVT` -> `Vres`).
 *    2. Re-initialise the input and call `rocsolver_gesdd` with the requested
 *       arguments; copy singular values, info and the requested vectors back
 *       to the host.
 *    3. For every batch instance:
 *       - a non-zero info is reported through `EXPECT_EQ` and adds 1 to
 *         `*max_err`; the instance is otherwise skipped;
 *       - if no singular vectors were requested, the singular values are
 *         compared against `cpu_gesvd` (relative Frobenius-norm error);
 *       - otherwise the orthogonality of the requested vectors
 *         (`U^H U - I`, `Vt Vt^H - I`) is folded into `*max_errv`, and the
 *         reconstruction residual `||A - U S Vt|| / a_bound` into `*max_err`,
 *         where `a_bound` is the norm of `A^H A` (m >= n) or `A A^H` (m < n).
 *
 *  @param[out] max_err   largest singular-value / reconstruction error, plus
 *                        one for every instance with non-zero info.
 *  @param[out] max_errv  largest orthogonality error of the requested vectors.
 **/
template void gesdd_getError(const rocblas_handle handle,
                             const rocblas_svect left_svect,
                             const rocblas_svect right_svect,
                             const rocblas_int m,
                             const rocblas_int n,
                             Wd& dA,
                             const rocblas_int lda,
                             const rocblas_stride stA,
                             Td& dS,
                             const rocblas_stride stS,
                             Ud& dU,
                             const rocblas_int ldu,
                             const rocblas_stride stU,
                             Ud& dV,
                             const rocblas_int ldv,
                             const rocblas_stride stV,
                             Id& dinfo,
                             const rocblas_int bc,
                             const rocblas_svect left_svectT,
                             const rocblas_svect right_svectT,
                             const rocblas_int mT,
                             const rocblas_int nT,
                             Ud& dUT,
                             const rocblas_int lduT,
                             const rocblas_stride stUT,
                             Ud& dVT,
                             const rocblas_int ldvT,
                             const rocblas_stride stVT,
                             Wh& hA,
                             Th& hS,
                             Th& hSres,
                             Uh& hU,
                             Uh& Ures,
                             const rocblas_int ldures,
                             Uh& hV,
                             Uh& Vres,
                             const rocblas_int ldvres,
                             Ih& hinfo,
                             Ih& hinfoRes,
                             double* max_err,
                             double* max_errv)
{
    using HMat = HostMatrix;
    using BDesc = typename HMat::BlockDescriptor;

    rocblas_int lwork = 5 * std::max(m, n);
    rocblas_int lrwork = (rocblas_is_complex ? 5 * std::min(m, n) : 0);
    std::vector work(lwork);
    std::vector rwork(lrwork);
    // element count computed in size_t so that lda * n * bc cannot overflow
    std::vector A(size_t(lda) * n * bc);

    // input data initialization
    gesdd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A);

    // If one of `left_svect` or `right_svect` was requested, this will guarantee
    // that the other is computed as well
    CHECK_ROCBLAS_ERROR(rocsolver_gesdd(STRIDED, handle, left_svectT, right_svectT, mT, nT,
                                        dA.data(), lda, stA, dS.data(), stS, dUT.data(), lduT,
                                        stUT, dVT.data(), ldvT, stVT, dinfo.data(), bc));

    if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none)
        CHECK_HIP_ERROR(Ures.transfer_from(dUT));
    if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none)
        CHECK_HIP_ERROR(Vres.transfer_from(dVT));

    // the first call overwrote dA, so the input has to be set up again
    gesdd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A);

    // GPU lapack
    CHECK_ROCBLAS_ERROR(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA.data(),
                                        lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(),
                                        ldv, stV, dinfo.data(), bc));
    CHECK_HIP_ERROR(hSres.transfer_from(dS));
    CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo));

    if(left_svect == rocblas_svect_singular || left_svect == rocblas_svect_all)
        CHECK_HIP_ERROR(Ures.transfer_from(dU));
    if(right_svect == rocblas_svect_singular || right_svect == rocblas_svect_all)
        CHECK_HIP_ERROR(Vres.transfer_from(dV));

    *max_err = 0;
    *max_errv = 0;
    double err;

    const bool no_singular_vectors
        = (left_svect == rocblas_svect_none) && (right_svect == rocblas_svect_none);

    for(rocblas_int b = 0; b < bc; ++b)
    {
        // We expect gesdd to converge for all input matrices
        EXPECT_EQ(hinfoRes[b][0], 0) << "where b = " << b;
        if(hinfoRes[b][0] != 0)
        {
            *max_err += 1;
            continue;
        }

        err = 0.;

        // Number of singular values (i.e., dimension of S) is always smallest
        // number between rows and columns of input matrix A
        rocblas_int dim_S = std::min(m, n);
        rocblas_int ncols_U = dim_S;
        rocblas_int nrows_V = dim_S;

        // Only check singular values
        if(no_singular_vectors)
        {
            // CPU lapack
            cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu,
                      hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]);

            // err = ||hS - hSres||_F / ||hS||_F
            err = norm_error('F', 1, dim_S, 1, hS[b], hSres[b]);
            *max_err = err > *max_err ? err : *max_err;
        }
        // Check singular vectors and singular values
        else
        {
            // Get input matrix A (offset computed in size_t to avoid overflow;
            // the block gets its own name instead of shadowing the vector `A`)
            auto AWrap = HMat::Wrap(A.data() + size_t(b) * lda * n, lda, n);
            auto Amat = (*AWrap).block(BDesc().nrows(m).ncols(n));

            // Get computed singular values (convert singular values from type
            // S to type T, if required)
            auto svals = *HMat::Convert(hSres[b], dim_S, 1);
            auto S = HMat::Zeros(dim_S, dim_S);
            S.diag(svals);

            // Get computed eigenvectors
            auto U = (*HMat::Wrap(Ures[b], ldures, ncols_U)).block(BDesc().nrows(m).ncols(ncols_U));
            auto Vt = (*HMat::Wrap(Vres[b], ldvres, n)).block(BDesc().nrows(nrows_V).ncols(n));

            // Check orthogonality of left singular vectors if they were requested
            if(left_svect != rocblas_svect_none)
            {
                auto UE = adjoint(U) * U - HMat::Eye(ncols_U, ncols_U);
                err = UE.max_col_norm();
                *max_errv = err > *max_errv ? err : *max_errv;
            }

            // Check orthogonality of right singular vectors if they were requested
            if(right_svect != rocblas_svect_none)
            {
                auto VE = Vt * adjoint(Vt) - HMat::Eye(nrows_V, nrows_V);
                err = VE.max_col_norm();
                *max_errv = err > *max_errv ? err : *max_errv;
            }

            // Check residual error of reconstructed A
            double a_bound = 1.;
            if(m >= n)
            {
                a_bound = (adjoint(Amat) * Amat).norm();
            }
            else // (m < n)
            {
                a_bound = (Amat * adjoint(Amat)).norm();
            }

            auto AE = Amat - U * S * Vt;
            err = AE.norm() / a_bound;
            *max_err = err > *max_err ? err : *max_err;
        }
    }
}

template void gesdd_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, Wh& hA, Th& hS, Uh& hU, Uh& hV, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = 5 * std::min(m, n); std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { gesdd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(left_svect, right_svect, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesdd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesdd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesdd_initData(handle,
left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesdd(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesdd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", (rightvC == 'A' ? n : std::min(m, n))); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", std::min(m, n)); rocblas_stride stU = argus.get("strideU", (leftvC == 'A' ? ldu * m : ldu * std::min(m, n))); rocblas_stride stV = argus.get("strideV", ldv * n); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if((rightv != rocblas_svect_none && rightv != rocblas_svect_singular && rightv != rocblas_svect_all) || (leftv != rocblas_svect_none && leftv != rocblas_svect_singular && leftv != rocblas_svect_all)) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** Orthogonality and 
reconstruction errors will be computed explicitly as * part of `gesdd_getError` method, which may require an extra call to * `rocsolver_gesdd` for the cases in which only one of `left_svect` or * `right_svect` is requested. If such extra call is required, initialize * variables `leftvT`, `rightvT`, `ldvT`, `lduT`, `mT`, and `nT` * accordingly. **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = std::min(m, n); mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_Sres = 0; size_t size_Ures = 0; size_t size_Vres = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(std::min(m, n)); size_t size_U = (leftvC == 'A' ? 
size_t(ldu) * m : size_t(ldu) * std::min(m, n)); size_t size_V = size_t(ldv) * n; if(argus.unit_check || argus.norm_check) { size_Sres = size_S; if(svects) { if(leftv == rocblas_svect_none) { size_UT = size_t(lduT) * std::min(mT, nT); size_Ures = size_UT; ldures = lduT; } else { size_Ures = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_VT = size_t(ldvT) * nT; size_Vres = size_VT; ldvres = ldvT; } else { size_Vres = size_V; ldvres = ldv; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_Ures; rocblas_stride stVres = size_Vres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || ((leftv == rocblas_svect_all || leftv == rocblas_svect_singular) && ldu < m) || ((rightv == rocblas_svect_all && ldv < n) || (rightv == rocblas_svect_singular && ldv < std::min(m, n))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesdd( STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesdd( STRIDED, handle, leftvT, rightvT, mT, nT, (T* const*)nullptr, lda, stA, 
(S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesdd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesdd(STRIDED, handle, leftvT, rightvT, mT, nT, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_Sres, 1, stS, bc); host_strided_batch_vector Vres(size_Vres, 1, stVres, bc); host_strided_batch_vector Ures(size_Ures, 1, stUres, bc); // device device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { 
EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesdd_getError( handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesdd_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesdd(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesdd_getError( handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesdd_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results 
for rocsolver-test // using 3 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 3 * std::min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 3 * std::min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stS, ldu, stU, ldv, stV, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideA", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stA, stS, ldu, stU, ldv, stV, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESDD(...) 
extern template void testing_gesdd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESDD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesv.cpp000066400000000000000000000032741503202240500235700ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_gesv.hpp" #define TESTING_GESV(...) 
template void testing_gesv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesv.hpp000066400000000000000000000556371503202240500236070ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gesv_checkBadArgs(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, T dB, const rocblas_int ldb, const rocblas_stride stB, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, nullptr, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, (T) nullptr, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, (U) nullptr, stP, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, 0, nrhs, (T) nullptr, lda, stA, (U) nullptr, stP, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, 0, dA, lda, stA, dIpiv, stP, (T) nullptr, ldb, stB, dInfo, bc), 
rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, 0), rocblas_status_success); } template void testing_gesv_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_stride stB = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gesv_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments gesv_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc); } } template void gesv_initData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j 
* lda] += 400; else hA[b][i + j * lda] -= 4; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes). // The algorithm must detect the first zero element in the // diagonal of those matrices in the batch that are singular rocblas_int j = n / 4 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) hA[b][i + j * lda] = 0; j = n / 2 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) hA[b][i + j * lda] = 0; j = n - 1 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) hA[b][i + j * lda] = 0; } } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } }

/** Solves the same systems on the device (`rocsolver_gesv`) and on the host
 *  (`cpu_gesv`) and reports how far the two results are apart.
 *
 *  `*max_err` receives the largest `norm_error('I', ...)` between the host and
 *  device solutions over all batch instances, plus the number of instances
 *  whose host and device info values differ (each mismatch is also reported
 *  through `EXPECT_EQ`).
 **/
template void gesv_getError(const rocblas_handle handle,
                            const rocblas_int n,
                            const rocblas_int nrhs,
                            Td& dA,
                            const rocblas_int lda,
                            const rocblas_stride stA,
                            Ud& dIpiv,
                            const rocblas_stride stP,
                            Td& dB,
                            const rocblas_int ldb,
                            const rocblas_stride stB,
                            Ud& dInfo,
                            const rocblas_int bc,
                            Th& hA,
                            Uh& hIpiv,
                            Th& hB,
                            Th& hBRes,
                            Uh& hInfo,
                            Uh& hInfoRes,
                            double* max_err,
                            const bool singular)
{
    // input data initialization
    gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB,
                  singular);

    // execute computations
    // GPU lapack
    CHECK_ROCBLAS_ERROR(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(),
                                       stP, dB.data(), ldb, stB, dInfo.data(), bc));
    CHECK_HIP_ERROR(hBRes.transfer_from(dB));
    CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo));

    // CPU lapack
    for(rocblas_int b = 0; b < bc; ++b)
    {
        cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb, hInfo[b]);
    }

    // error is ||hB - hBRes|| / ||hB||
    // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES.
    // IT MIGHT BE REVISITED IN THE FUTURE)
    // using vector-induced infinity norm
    double worst = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        const double batch_err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]);
        if(batch_err > worst)
            worst = batch_err;
    }

    // also check info for singularities
    double mismatches = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b;
        if(hInfo[b][0] != hInfoRes[b][0])
            mismatches++;
    }

    *max_err = worst + mismatches;
}

template void gesv_getPerfData(const rocblas_handle handle, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Th& hB, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); // cold calls for(int iter = 0; iter < 2; iter++) { gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); CHECK_ROCBLAS_ERROR(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesv_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hB, singular); start = get_time_us_sync(stream); rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesv(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? 
size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_gesv(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_gesv(STRIDED, handle, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); 
CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gesv_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gesv_getPerfData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesv(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gesv_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, 
hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gesv_getPerfData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dInfo, bc, hA, hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "nrhs", "lda", "ldb", "strideP", "batch_c"); rocsolver_bench_output(n, nrhs, lda, ldb, stP, bc); } else if(STRIDED) { rocsolver_bench_output("n", "nrhs", "lda", "ldb", "strideA", "strideP", "strideB", "batch_c"); rocsolver_bench_output(n, nrhs, lda, ldb, stA, stP, stB, bc); } else { rocsolver_bench_output("n", "nrhs", "lda", "ldb"); rocsolver_bench_output(n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESV(...) extern template void testing_gesv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesv_outofplace.hpp000066400000000000000000000645321503202240500260220ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/

#pragma once

#include "common/misc/clientcommon.hpp"
#include "common/misc/lapack_host_reference.hpp"
#include "common/misc/norm.hpp"
#include "common/misc/rocsolver.hpp"
#include "common/misc/rocsolver_arguments.hpp"
#include "common/misc/rocsolver_test.hpp"

// Checks the status codes returned by rocsolver_gesv_outofplace for invalid
// and degenerate arguments.
//
// The calls are grouped as: null handle (invalid_handle); batch_count of -1
// when STRIDED is set (invalid_size); each of A, ipiv, B, X and info replaced
// in turn by a null pointer (invalid_pointer); quick-return cases with n == 0
// or nrhs == 0 and null data pointers (success); and batch_count of 0 when
// STRIDED is set (success).
//
// NOTE(review): the template parameter list is missing from this text
// (STRIDED, T and U are used without a visible declaration).
template void gesv_outofplace_checkBadArgs(const rocblas_handle handle,
                                           const rocblas_int n,
                                           const rocblas_int nrhs,
                                           T dA,
                                           const rocblas_int lda,
                                           const rocblas_stride stA,
                                           U dIpiv,
                                           const rocblas_stride stP,
                                           T dB,
                                           const rocblas_int ldb,
                                           const rocblas_stride stB,
                                           T dX,
                                           const rocblas_int ldx,
                                           const rocblas_stride stX,
                                           U dInfo,
                                           const rocblas_int bc)
{
    // handle
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, nullptr, n, nrhs, dA, lda, stA, dIpiv,
                                                    stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc),
                          rocblas_status_invalid_handle);

    // values
    // N/A

    // sizes (only check batch_count if applicable)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv,
                                                        stP, dB, ldb, stB, dX, ldx, stX, dInfo, -1),
                              rocblas_status_invalid_size);

    // pointers
    // (one argument at a time is nulled: A, ipiv, B, X, then info)
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T) nullptr, lda, stA,
                                                    dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA,
                                                    (U) nullptr, stP, dB, ldb, stB, dX, ldx, stX,
                                                    dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv,
                                                    stP, (T) nullptr, ldb, stB, dX, ldx, stX, dInfo,
                                                    bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv,
                                                    stP, dB, ldb, stB, (T) nullptr, ldx, stX, dInfo,
                                                    bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv,
                                                    stP, dB, ldb, stB, dX, ldx, stX, (U) nullptr, bc),
rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    // (n == 0 with A, ipiv, B and X null; nrhs == 0 with B and X null)
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, 0, nrhs, (T) nullptr, lda, stA,
                                                    (U) nullptr, stP, (T) nullptr, ldb, stB,
                                                    (T) nullptr, ldx, stX, dInfo, bc),
                          rocblas_status_success);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, 0, dA, lda, stA, dIpiv, stP,
                                                    (T) nullptr, ldb, stB, (T) nullptr, ldx, stX,
                                                    dInfo, bc),
                          rocblas_status_success);

    // quick return with zero batch_count if applicable
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA, lda, stA, dIpiv,
                                                        stP, dB, ldb, stB, dX, ldx, stX, dInfo, 0),
                              rocblas_status_success);
}

// Bad-argument driver for gesv_outofplace.
//
// Creates a local handle and one-element device buffers for A, B, X, the pivot
// vector and info, checks each allocation with memcheck(), and passes the raw
// device pointers to gesv_outofplace_checkBadArgs using
// n = nrhs = lda = ldb = ldx = bc = 1 and unit strides. A, B and X are
// device_batch_vector objects when BATCHED is set and
// device_strided_batch_vector objects otherwise.
//
// NOTE(review): the template parameter list and the template arguments of the
// vector types are missing from this text (BATCHED is used without a visible
// declaration).
template void testing_gesv_outofplace_bad_arg()
{
    // safe arguments
    rocblas_local_handle handle;
    rocblas_int n = 1;
    rocblas_int nrhs = 1;
    rocblas_int lda = 1;
    rocblas_int ldb = 1;
    rocblas_int ldx = 1;
    rocblas_stride stA = 1;
    rocblas_stride stP = 1;
    rocblas_stride stB = 1;
    rocblas_stride stX = 1;
    rocblas_int bc = 1;

    if(BATCHED)
    {
        // memory allocations
        device_batch_vector dA(1, 1, 1);
        device_batch_vector dB(1, 1, 1);
        device_batch_vector dX(1, 1, 1);
        device_strided_batch_vector dIpiv(1, 1, 1, 1);
        device_strided_batch_vector dInfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dB.memcheck());
        CHECK_HIP_ERROR(dX.memcheck());
        CHECK_HIP_ERROR(dIpiv.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check bad arguments
        gesv_outofplace_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP,
                                     dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc);
    }
    else
    {
        // memory allocations
        device_strided_batch_vector dA(1, 1, 1, 1);
        device_strided_batch_vector dB(1, 1, 1, 1);
        device_strided_batch_vector dX(1, 1, 1, 1);
        device_strided_batch_vector dIpiv(1, 1, 1, 1);
        device_strided_batch_vector dInfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dB.memcheck());
        CHECK_HIP_ERROR(dX.memcheck());
        CHECK_HIP_ERROR(dIpiv.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check bad arguments
        gesv_outofplace_checkBadArgs(handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP,
                                     dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc);
    }
}

// Prepares the input data for a gesv_outofplace test.
//
// When CPU is set, hA and hB are filled with rocblas_init, every hA[b] is made
// diagonally heavy (+400 on the diagonal, -4 elsewhere) and, when `singular`
// is set, batch entries bc/4, bc/2 and bc-1 get zeroed entries. When GPU is
// set, hA, hB and hIpiv are copied to dA, dB and dIpiv. The solution buffer X
// is not touched here.
//
// NOTE(review): CPU and GPU are not declared in the visible text; presumably
// they are boolean template parameters that were stripped from this dump.
template void gesv_outofplace_initData(const rocblas_handle handle,
                                       const rocblas_int n,
                                       const rocblas_int nrhs,
                                       Td& dA,
                                       const rocblas_int lda,
                                       const rocblas_stride stA,
                                       Ud& dIpiv,
                                       const rocblas_stride stP,
                                       Td& dB,
                                       const rocblas_int ldb,
                                       const rocblas_stride stB,
                                       const rocblas_int bc,
                                       Th& hA,
                                       Uh& hIpiv,
                                       Th& hB,
                                       const bool singular)
{
    if(CPU)
    {
        rocblas_init(hA, true);
        rocblas_init(hB, true);

        // scale A to avoid singularities
        for(rocblas_int b = 0; b < bc; ++b)
        {
            for(rocblas_int i = 0; i < n; i++)
            {
                for(rocblas_int j = 0; j < n; j++)
                {
                    if(i == j)
                        hA[b][i + j * lda] += 400;
                    else
                        hA[b][i + j * lda] -= 4;
                }
            }

            if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1))
            {
                // When required, add some singularities
                // (always the same elements for debugging purposes).
                // The algorithm must detect the first zero element in the
                // diagonal of those matrices in the batch that are singular
                //
                // Three index values j are zeroed over i in [0, n):
                // (n/4 + b), (n/2 + b) and (n-1 + b), each reduced modulo n by
                // the "j -= (j / n) * n" step.
                rocblas_int j = n / 4 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n / 2 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n - 1 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
            }
        }
    }

    if(GPU)
    {
        // now copy pivoting indices and matrices to the GPU
        CHECK_HIP_ERROR(dA.transfer_from(hA));
        CHECK_HIP_ERROR(dB.transfer_from(hB));
        CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv));
    }
}

// Runs gesv_outofplace on the device and gesv on the host and reports how far
// they differ.
//
// Steps, in order:
//   1. initialise the inputs through gesv_outofplace_initData;
//   2. call rocsolver_gesv_outofplace and copy dX -> hBRes and
//      dInfo -> hInfoRes;
//   3. call cpu_gesv once per batch entry on hA/hIpiv/hB/hInfo;
//   4. for batch entries whose device info is 0, compare hB[b] (leading
//      dimension ldb) with hBRes[b] (leading dimension ldx) via
//      norm_error('I', ...) and keep the maximum in *max_err;
//   5. add to *max_err the number of batch entries whose host and device info
//      values differ (each mismatch also raises an EXPECT_EQ failure).
//
// NOTE(review): the template parameter list is missing from this text (STRIDED,
// Td, Ud, Th and Uh are used without a visible declaration), and the
// gesv_outofplace_initData call presumably carried template arguments
// selecting the CPU/GPU stages -- confirm against the upstream header.
template void gesv_outofplace_getError(const rocblas_handle handle,
                                       const rocblas_int n,
                                       const rocblas_int nrhs,
                                       Td& dA,
                                       const rocblas_int lda,
                                       const rocblas_stride stA,
                                       Ud& dIpiv,
                                       const rocblas_stride stP,
                                       Td& dB,
                                       const rocblas_int ldb,
                                       const rocblas_stride stB,
                                       Td& dX,
                                       const rocblas_int ldx,
                                       const rocblas_stride stX,
                                       Ud& dInfo,
                                       const rocblas_int bc,
                                       Th& hA,
                                       Uh& hIpiv,
                                       Th& hB,
                                       Th& hBRes,
                                       Uh& hInfo,
                                       Uh& hInfoRes,
                                       double* max_err,
                                       const bool singular)
{
    // input data initialization
    gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv,
                             hB, singular);

    // execute computations
    // GPU lapack
    CHECK_ROCBLAS_ERROR(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA,
                                                  dIpiv.data(), stP, dB.data(), ldb, stB, dX.data(),
                                                  ldx, stX, dInfo.data(), bc));
    CHECK_HIP_ERROR(hBRes.transfer_from(dX));
    CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo));

    // CPU lapack
    for(rocblas_int b = 0; b < bc; ++b)
    {
        cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb, hInfo[b]);
    }

    // error is ||hB - hBRes|| / ||hB||
    // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES.
    // IT MIGHT BE REVISITED IN THE FUTURE)
    // using vector-induced infinity norm
    double err;
    *max_err = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        // only compare solutions for entries the device reported as solved
        if(hInfoRes[b][0] == 0)
        {
            err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b], ldx);
            *max_err = err > *max_err ? err : *max_err;
        }
    }

    // also check info for singularities
    // (err is reused here as a mismatch counter)
    err = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b;
        if(hInfo[b][0] != hInfoRes[b][0])
            err++;
    }
    *max_err += err;
}

// Collects timing data for gesv_outofplace.
//
// Unless `perf` is set, the host reference (cpu_gesv over the whole batch) is
// timed first with get_time_us_no_sync and stored in *cpu_time_used; the device
// measurements follow.
//
// NOTE(review): the gesv_outofplace_initData calls in this function are
// textually identical in this dump; presumably they differed in stripped
// template arguments -- confirm against the upstream header.
template void gesv_outofplace_getPerfData(const rocblas_handle handle,
                                          const rocblas_int n,
                                          const rocblas_int nrhs,
                                          Td& dA,
                                          const rocblas_int lda,
                                          const rocblas_stride stA,
                                          Ud& dIpiv,
                                          const rocblas_stride stP,
                                          Td& dB,
                                          const rocblas_int ldb,
                                          const rocblas_stride stB,
                                          Td& dX,
                                          const rocblas_int ldx,
                                          const rocblas_stride stX,
                                          Ud& dInfo,
                                          const rocblas_int bc,
                                          Th& hA,
                                          Uh& hIpiv,
                                          Th& hB,
                                          Uh& hInfo,
                                          double* gpu_time_used,
                                          double* cpu_time_used,
                                          const rocblas_int hot_calls,
                                          const int profile,
                                          const bool profile_kernels,
                                          const bool perf,
                                          const bool singular)
{
    if(!perf)
    {
        gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA,
                                 hIpiv, hB, singular);

        // cpu-lapack performance (only if not in perf mode)
        *cpu_time_used = get_time_us_no_sync();
        for(rocblas_int b = 0; b < bc; ++b)
        {
            cpu_gesv(n, nrhs, hA[b], lda, hIpiv[b], hB[b], ldb,
hInfo[b]);
        }
        *cpu_time_used = get_time_us_no_sync() - *cpu_time_used;
    }

    gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv,
                             hB, singular);

    // cold calls
    // (two untimed runs, each on freshly initialised data)
    for(int iter = 0; iter < 2; iter++)
    {
        gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA,
                                 hIpiv, hB, singular);

        CHECK_ROCBLAS_ERROR(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA,
                                                      dIpiv.data(), stP, dB.data(), ldb, stB,
                                                      dX.data(), ldx, stX, dInfo.data(), bc));
    }

    // gpu-lapack performance
    hipStream_t stream;
    CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
    double start;

    // enable profile logging (optionally with kernel logging) when requested
    if(profile > 0)
    {
        if(profile_kernels)
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile
                                         | rocblas_layer_mode_ex_log_kernel);
        else
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile);
        rocsolver_log_set_max_levels(profile);
    }

    // hot calls: data is re-initialised outside the timed region, and only the
    // solver call itself is bracketed by get_time_us_sync on the handle's
    // stream. The solver's return status is not checked inside this loop.
    for(rocblas_int iter = 0; iter < hot_calls; iter++)
    {
        gesv_outofplace_initData(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA,
                                 hIpiv, hB, singular);

        start = get_time_us_sync(stream);
        rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP,
                                  dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc);
        *gpu_time_used += get_time_us_sync(stream) - start;
    }
    // average over the hot calls; the accumulator is not reset in this
    // function, so the caller is expected to pass it in zeroed
    *gpu_time_used /= hot_calls;
}

// Test/benchmark driver for gesv_outofplace.
//
// Reads the problem description from `argus`: n is mandatory; nrhs, lda, ldb
// and ldx default to n; the strides default to lda * n, n, ldb * nrhs and
// ldx * nrhs respectively.
//
// NOTE(review): the template parameter list and the template arguments of
// argus.get are missing from this text.
template void testing_gesv_outofplace(Arguments& argus)
{
    // get arguments
    rocblas_local_handle handle;
    rocblas_int n = argus.get("n");
    rocblas_int nrhs = argus.get("nrhs", n);
    rocblas_int lda = argus.get("lda", n);
    rocblas_int ldb = argus.get("ldb", n);
    rocblas_int ldx = argus.get("ldx", n);
    rocblas_stride stA = argus.get("strideA", lda * n);
    rocblas_stride stP = argus.get("strideP", n);
    rocblas_stride stB = argus.get("strideB", ldb * nrhs);
    rocblas_stride stX = argus.get("strideX", ldx * nrhs);

    rocblas_int bc = argus.batch_count;
    rocblas_int hot_calls = argus.iters;

    // the host copy of the device result is only sized when a check is requested
    rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ?
stX : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; size_t size_X = size_t(ldx) * nrhs; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_X : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || ldx < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY( rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldb, stB, (T* const*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_gesv_outofplace( STRIDED, handle, n, nrhs, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldb, stB, (T*)nullptr, ldx, stX, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, 
bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dX(size_X, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gesv_outofplace_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gesv_outofplace_getPerfData( handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dX(size_X, 1, stX, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); 
device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_X) CHECK_HIP_ERROR(dX.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesv_outofplace(STRIDED, handle, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, dX.data(), ldx, stX, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) gesv_outofplace_getError(handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hIpiv, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) gesv_outofplace_getPerfData( handle, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, dX, ldx, stX, dInfo, bc, hA, hIpiv, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "nrhs", "lda", "ldb", "ldx", "strideP", "batch_c"); rocsolver_bench_output(n, nrhs, lda, ldb, ldx, stP, bc); } else if(STRIDED) { rocsolver_bench_output("n", "nrhs", "lda", "ldb", "ldx", "strideA", "strideP", "strideB", "strideX", "batch_c"); rocsolver_bench_output(n, nrhs, lda, ldb, ldx, stA, stP, stB, stX, bc); } else { rocsolver_bench_output("n", "nrhs", "lda", "ldb", "ldx"); rocsolver_bench_output(n, nrhs, lda, ldb, ldx); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); 
rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvd.cpp000066400000000000000000000033001503202240500237220ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #include "testing_gesvd.hpp" #define TESTING_GESVD(...) template void testing_gesvd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvd.hpp000066400000000000000000001150651503202240500237430ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/

#pragma once

#include "common/misc/client_util.hpp"
#include "common/misc/clientcommon.hpp"
#include "common/misc/lapack_host_reference.hpp"
#include "common/misc/norm.hpp"
#include "common/misc/rocsolver.hpp"
#include "common/misc/rocsolver_arguments.hpp"
#include "common/misc/rocsolver_test.hpp"

// Checks the status codes returned by rocsolver_gesvd for invalid and
// degenerate arguments.
//
// The calls are grouped as: null handle (invalid_handle); invalid svect values,
// i.e. rocblas_svect(0) for either side and rocblas_svect_overwrite for both
// sides at once (invalid_value); batch_count of -1 when STRIDED is set
// (invalid_size); each of A, S, U, V, E and info replaced in turn by a null
// pointer (invalid_pointer); quick-return cases with m == 0 or n == 0 and null
// data pointers (success); and batch_count of 0 with a null info pointer when
// STRIDED is set (success).
//
// NOTE(review): the template parameter list is missing from this text
// (STRIDED, T, TT, W and U are used without a visible declaration).
template void gesvd_checkBadArgs(const rocblas_handle handle,
                                 const rocblas_svect left_svect,
                                 const rocblas_svect right_svect,
                                 const rocblas_int m,
                                 const rocblas_int n,
                                 W dA,
                                 const rocblas_int lda,
                                 const rocblas_stride stA,
                                 TT dS,
                                 const rocblas_stride stS,
                                 T dU,
                                 const rocblas_int ldu,
                                 const rocblas_stride stU,
                                 T dV,
                                 const rocblas_int ldv,
                                 const rocblas_stride stV,
                                 TT dE,
                                 const rocblas_stride stE,
                                 const rocblas_workmode fa,
                                 U dinfo,
                                 const rocblas_int bc)
{
    // handle
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, nullptr, left_svect, right_svect, m, n, dA, lda,
                                          stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                          dinfo, bc),
                          rocblas_status_invalid_handle);

    // values
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, rocblas_svect(0), right_svect, m, n, dA,
                                          lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                          dinfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, rocblas_svect(0), m, n, dA,
                                          lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                          dinfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, rocblas_svect_overwrite,
                                          rocblas_svect_overwrite, m, n, dA, lda, stA, dS, stS, dU,
                                          ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc),
                          rocblas_status_invalid_value);

    // sizes (only check batch_count if applicable)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                              stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                              dinfo, -1),
                              rocblas_status_invalid_size);

    // pointers
    // (one argument at a time is nulled: A, S, U, V, E, then info)
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, (W) nullptr,
                                          lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                          dinfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                          stA, (TT) nullptr, stS, dU, ldu, stU, dV, ldv, stV, dE, stE,
                                          fa, dinfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                          stA, dS, stS, (T) nullptr, ldu, stU, dV, ldv, stV, dE, stE,
                                          fa, dinfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                          stA, dS, stS, dU, ldu, stU, (T) nullptr, ldv, stV, dE, stE,
                                          fa, dinfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                          stA, dS, stS, dU, ldu, stU, dV, ldv, stV, (TT) nullptr, stE,
                                          fa, dinfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                          stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                          (U) nullptr, bc),
                          rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    // (m == 0 keeps V valid, n == 0 keeps U valid; info stays valid in both)
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, 0, n, (W) nullptr,
                                          lda, stA, (TT) nullptr, stS, (T) nullptr, ldu, stU, dV, ldv,
                                          stV, (TT) nullptr, stE, fa, dinfo, bc),
                          rocblas_status_success);
    EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, 0, (W) nullptr,
                                          lda, stA, (TT) nullptr, stS, dU, ldu, stU, (T) nullptr, ldv,
                                          stV, (TT) nullptr, stE, fa, dinfo, bc),
                          rocblas_status_success);

    // quick return with zero batch_count if applicable
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA, lda,
                                              stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa,
                                              (U) nullptr, 0),
                              rocblas_status_success);
}

// Bad-argument driver for gesvd.
//
// NOTE(review): the template parameter list is missing from this text; T is
// used in the alias below without a visible declaration.
template void testing_gesvd_bad_arg()
{
    // S is the type returned by std::real for a value of type T
    using S = decltype(std::real(T{}));

    // safe arguments
rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_all; rocblas_svect right_svect = rocblas_svect_all; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_stride stE = 2; rocblas_int bc = 1; rocblas_workmode fa = rocblas_outofplace; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvd_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvd_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc); } } template void gesvd_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const 
rocblas_int bc, Th& hA, std::vector& A, const bool test, const bool singular) { if(CPU) { rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { if(!singular) { // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } else { // form a singular matrix consisting of all ones for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { hA[b][i + j * lda] = 1; } } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvd_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Td& dE, const rocblas_stride stE, const rocblas_workmode fa, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect right_svectT, const rocblas_int mT, const rocblas_int nT, Ud& dUT, const rocblas_int lduT, const rocblas_stride stUT, Ud& dVT, const rocblas_int ldvT, const rocblas_stride stVT, Wh& hA, Th& hS, Th& hSres, Uh& hU, Uh& Ures, const rocblas_int ldures, Uh& hV, Uh& Vres, const rocblas_int ldvres, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv, const bool singular) { using W = decltype(std::real(T{})); rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 
5 * std::min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, true, singular); // execute computations: // complementary execution to compute all singular vectors if needed (always in-place to ensure // we don't combine results computed by gemm_batched with results computed by gemm_strided_batched) CHECK_ROCBLAS_ERROR(rocsolver_gesvd(STRIDED, handle, left_svectT, right_svectT, mT, nT, dA.data(), lda, stA, dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, dE.data(), stE, rocblas_inplace, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(Ures.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(Vres.transfer_from(dVT)); gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, true, singular); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular || left_svect == rocblas_svect_all) CHECK_HIP_ERROR(Ures.transfer_from(dU)); if(right_svect == rocblas_svect_singular || right_svect == rocblas_svect_all) CHECK_HIP_ERROR(Vres.transfer_from(dV)); if(left_svect == rocblas_svect_overwrite) { CHECK_HIP_ERROR(hA.transfer_from(dA)); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < std::min(m, n); j++) Ures[b][i + j * ldures] = hA[b][i + j * lda]; } } } 
if(right_svect == rocblas_svect_overwrite) { CHECK_HIP_ERROR(hA.transfer_from(dA)); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < std::min(m, n); i++) { for(rocblas_int j = 0; j < n; j++) Vres[b][i + j * ldvres] = hA[b][i + j * lda]; } } } // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err; *max_errv = 0; for(rocblas_int b = 0; b < bc; ++b) { // error is ||hS - hSres|| err = norm_error('F', 1, std::min(m, n), 1, hS[b], hSres[b]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < std::min(m, n); ++k) { for(rocblas_int i = 0; i < m; ++i) { T tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * sconj(Vres[b][k + j * ldvres]); tmp -= hSres[b][k] * Ures[b][i + k * ldures]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv; } } } template void gesvd_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Td& dE, const rocblas_stride stE, const rocblas_workmode fa, Id& dinfo, const rocblas_int bc, Wh& hA, Th& hS, Uh& hU, Uh& hV, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { using W = decltype(std::real(T{})); rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 5 * std::min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(left_svect, right_svect, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_gesvd( STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | 
rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvd_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, false, singular); start = get_time_us_sync(stream); rocsolver_gesvd(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesvd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", (rightvC == 'A' ? n : std::min(m, n))); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", std::min(m, n)); rocblas_stride stU = argus.get("strideU", ldu * m); rocblas_stride stV = argus.get("strideV", ldv * n); rocblas_stride stE = argus.get("strideE", std::min(m, n) - 1); char faC = argus.get("fast_alg"); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_workmode fa = char2rocblas_workmode(faC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; if(argus.alg_mode == 1) { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_gesvd, rocsolver_alg_mode_hybrid), rocblas_status_success); rocsolver_alg_mode alg_mode; EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_gesvd, &alg_mode), rocblas_status_success); EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); } // check non-supported values if(rightv == rocblas_svect_overwrite && leftv == 
rocblas_svect_overwrite) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_all; lduT = m; mT = m; nT = n; if((n > m && fa == rocblas_outofplace) || (n > m && rightv == rocblas_svect_overwrite)) rightvT = rocblas_svect_overwrite; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_all; ldvT = n; mT = m; nT = n; if((m >= n && fa == rocblas_outofplace) || (m >= n && leftv == rocblas_svect_overwrite)) leftvT = rocblas_svect_overwrite; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_Sres = 0; size_t size_Ures = 0; size_t size_Vres = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(std::min(m, n)); size_t size_E = size_t(std::min(m, n) - 1); size_t size_V = size_t(ldv) * n; size_t size_U = size_t(ldu) * m; if(argus.unit_check || 
argus.norm_check) { size_VT = size_t(ldvT) * nT; size_UT = size_t(lduT) * mT; size_Sres = size_S; if(svects) { if(leftv == rocblas_svect_none) { size_Ures = size_UT; ldures = lduT; } else if(leftv == rocblas_svect_singular || leftv == rocblas_svect_all) { size_Ures = size_U; ldures = ldu; } else { size_Ures = m * m; ldures = m; } if(rightv == rocblas_svect_none) { size_Vres = size_VT; ldvres = ldvT; } else if(rightv == rocblas_svect_singular || rightv == rocblas_svect_all) { size_Vres = size_V; ldvres = ldv; } else { size_Vres = n * n; ldvres = n; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_Ures; rocblas_stride stVres = size_Vres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || ((leftv == rocblas_svect_all || leftv == rocblas_svect_singular) && ldu < m) || ((rightv == rocblas_svect_all && ldv < n) || (rightv == rocblas_svect_singular && ldv < std::min(m, n))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, 
(S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftvT, rightvT, mT, nT, (T* const*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvd(STRIDED, handle, leftvT, rightvT, mT, nT, (T*)nullptr, lda, stA, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (S*)nullptr, stE, fa, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_Sres, 1, stS, bc); host_strided_batch_vector Vres(size_Vres, 1, stVres, bc); host_strided_batch_vector Ures(size_Ures, 1, stUres, bc); // device device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) 
CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvd_getError(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv, argus.singular); } // collect performance data if(argus.timing) { gesvd_getPerfData( handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvd(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dE.data(), stE, fa, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvd_getError(handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, leftvT, rightvT, mT, 
nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv, argus.singular); } // collect performance data if(argus.timing) { gesvd_getPerfData( handle, leftv, rightv, m, n, dA, lda, stA, dS, stS, dU, ldu, stU, dV, ldv, stV, dE, stE, fa, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * std::min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * std::min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideS", "ldu", "strideU", "ldv", "strideV", "strideE", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stS, ldu, stU, ldv, stV, stE, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideA", "strideS", "ldu", "strideU", "ldv", "strideV", "strideE", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stA, stS, ldu, stU, ldv, stV, stE, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else 
rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESVD(...) extern template void testing_gesvd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvdj.cpp000066400000000000000000000033041503202240500241000ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_gesvdj.hpp" #define TESTING_GESVDJ(...) 
template void testing_gesvdj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvdj.hpp000066400000000000000000001155511503202240500241150ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gesvdj_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, S dResidual, const rocblas_int max_sweeps, I dSweeps, S dS, const rocblas_stride stS, U dU, const rocblas_int ldu, const rocblas_stride stU, U dV, const rocblas_int ldv, const rocblas_stride stV, I dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, nullptr, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, rocblas_svect_overwrite, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, rocblas_svect_overwrite, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, 
dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, 0, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, 0, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_success); // quick return 
with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, 0), rocblas_status_success); } template void testing_gesvdj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_int bc = 1; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); 
CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } } template void gesvdj_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvdj_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect right_svectT, const rocblas_int mT, const rocblas_int nT, Ud& dUT, const rocblas_int lduT, const rocblas_stride stUT, Ud& dVT, const rocblas_int ldvT, const 
rocblas_stride stVT, Wh& hA, Th& hResidualRes, Ih& hSweepsRes, Th& hS, Th& hSres, Uh& hU, Uh& Ures, const rocblas_int ldures, Uh& hV, Uh& Vres, const rocblas_int ldvres, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv) { rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 5 * std::min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed CHECK_ROCBLAS_ERROR(rocsolver_gesvdj(STRIDED, handle, left_svectT, right_svectT, mT, nT, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(Ures.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(Vres.transfer_from(dVT)); gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular || left_svect == rocblas_svect_all) CHECK_HIP_ERROR(Ures.transfer_from(dU)); if(right_svect == rocblas_svect_singular || right_svect == 
rocblas_svect_all) CHECK_HIP_ERROR(Vres.transfer_from(dV)); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) { EXPECT_GE(hResidualRes[b][0], 0) << "where b = " << b; if(hResidualRes[b][0] < 0) *max_err += 1; } // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) { EXPECT_GE(hResidualRes[b][0], 0) << "where b = " << b; EXPECT_LE(hSweepsRes[b][0], max_sweeps) << "where b = " << b; if(hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps) *max_err += 1; } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err; *max_errv = 0; for(rocblas_int b = 0; b < bc; ++b) { // error is ||hS - hSres|| err = norm_error('F', 1, std::min(m, n), 1, hS[b], hSres[b]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < std::min(m, n); ++k) { for(rocblas_int i = 0; i < m; ++i) { T tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * sconj(Vres[b][k + j * ldvres]); tmp -= hSres[b][k] * Ures[b][i + k * ldures]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv; } } } template void gesvdj_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, Wh& hA, Th& hS, Uh& hU, Uh& hV, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = 5 * std::min(m, n); std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(left_svect, right_svect, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvdj_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesvdj(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesvdj(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", (rightvC == 'A' ? n : std::min(m, n))); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", std::min(m, n)); rocblas_stride stU = argus.get("strideU", (leftvC == 'A' ? 
ldu * m : ldu * std::min(m, n))); rocblas_stride stV = argus.get("strideV", ldv * n); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if((rightv != rocblas_svect_none && rightv != rocblas_svect_singular && rightv != rocblas_svect_all) || (leftv != rocblas_svect_none && leftv != rocblas_svect_singular && leftv != rocblas_svect_all)) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. 
WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = std::min(m, n); mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_Sres = 0; size_t size_Ures = 0; size_t size_Vres = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(std::min(m, n)); size_t size_U = (leftvC == 'A' ? size_t(ldu) * m : size_t(ldu) * std::min(m, n)); size_t size_V = size_t(ldv) * n; if(argus.unit_check || argus.norm_check) { size_Sres = size_S; if(svects) { if(leftv == rocblas_svect_none) { size_UT = size_t(lduT) * std::min(mT, nT); size_Ures = size_UT; ldures = lduT; } else { size_Ures = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_VT = size_t(ldvT) * nT; size_Vres = size_VT; ldvres = ldvT; } else { size_Vres = size_V; ldvres = ldv; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_Ures; rocblas_stride stVres = size_Vres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || ((leftv == rocblas_svect_all || leftv == rocblas_svect_singular) && ldu < m) || ((rightv == rocblas_svect_all && ldv < n) || (rightv == rocblas_svect_singular && ldv < std::min(m, n))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, 
max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftvT, rightvT, mT, nT, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj( STRIDED, handle, leftvT, rightvT, mT, nT, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host 
host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_Sres, 1, stS, bc); host_strided_batch_vector Vres(size_Vres, 1, stVres, bc); host_strided_batch_vector Ures(size_Ures, 1, stUres, bc); // device device_strided_batch_vector dResidual(1, 1, 1, bc); device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdj_getError( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, 
stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdj_getError( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_getPerfData(handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * std::min(m, n)); if(svects) 
ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * std::min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideA", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stA, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESVDJ(...) 
extern template void testing_gesvdj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvdj_notransv.hpp000066400000000000000000001207151503202240500260450ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gesvdj_notransv_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, S dResidual, const rocblas_int max_sweeps, I dSweeps, S dS, const rocblas_stride stS, U dU, const rocblas_int ldu, const rocblas_stride stU, U dV, const rocblas_int ldv, const rocblas_stride stV, I dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, nullptr, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, rocblas_svect_overwrite, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, rocblas_svect_overwrite, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, (T) 
nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, 0, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stS, (U) nullptr, ldu, stU, dV, ldv, stV, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, 0, (T) nullptr, lda, stA, abstol, dResidual, 
max_sweeps, dSweeps, (S) nullptr, stS, dU, ldu, stU, (U) nullptr, ldv, stV, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, (I) nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, (I) nullptr, 0), rocblas_status_success); } template void testing_gesvdj_notransv_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_int bc = 1; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_notransv_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 
1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments gesvdj_notransv_checkBadArgs(handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); } } template void gesvdj_notransv_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvdj_notransv_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect 
right_svectT, const rocblas_int mT, const rocblas_int nT, Ud& dUT, const rocblas_int lduT, const rocblas_stride stUT, Ud& dVT, const rocblas_int ldvT, const rocblas_stride stVT, Wh& hA, Th& hResidualRes, Ih& hSweepsRes, Th& hS, Th& hSres, Uh& hU, Uh& Ures, const rocblas_int ldures, Uh& hV, Uh& Vres, const rocblas_int ldvres, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv) { rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 5 * std::min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed CHECK_ROCBLAS_ERROR(rocsolver_gesvdj_notransv( STRIDED, handle, left_svectT, right_svectT, mT, nT, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(Ures.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(Vres.transfer_from(dVT)); gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvdj_notransv( STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); 
CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular || left_svect == rocblas_svect_all) CHECK_HIP_ERROR(Ures.transfer_from(dU)); if(right_svect == rocblas_svect_singular || right_svect == rocblas_svect_all) CHECK_HIP_ERROR(Vres.transfer_from(dV)); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) { EXPECT_GE(hResidualRes[b][0], 0) << "where b = " << b; if(hResidualRes[b][0] < 0) *max_err += 1; } // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) { EXPECT_GE(hResidualRes[b][0], 0) << "where b = " << b; EXPECT_LE(hSweepsRes[b][0], max_sweeps) << "where b = " << b; if(hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps) *max_err += 1; } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err; *max_errv = 0; for(rocblas_int b = 0; b < bc; ++b) { // error is ||hS - hSres|| err = norm_error('F', 1, std::min(m, n), 1, hS[b], hSres[b]); *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < std::min(m, n); ++k) { for(rocblas_int i = 0; i < m; ++i) { T tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * Vres[b][j + k * ldvres]; tmp -= hSres[b][k] * Ures[b][i + k * ldures]; err += std::abs(tmp) * std::abs(tmp); } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv; } } } template void gesvdj_notransv_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, Td& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Td& dS, const rocblas_stride stS, Ud& dU, const rocblas_int ldu, const rocblas_stride stU, Ud& dV, const rocblas_int ldv, const rocblas_stride stV, Id& dinfo, const rocblas_int bc, Wh& hA, Th& hS, Uh& hU, Uh& hV, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = 5 * std::min(m, n); std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_gesvd(left_svect, right_svect, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesvdj_notransv( STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvdj_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesvdj_notransv(STRIDED, handle, left_svect, right_svect, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesvdj_notransv(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", std::min(m, n)); rocblas_stride stU = argus.get("strideU", (leftvC == 'A' ? ldu * m : ldu * std::min(m, n))); rocblas_stride stV = argus.get("strideV", (rightvC == 'A' ? 
ldv * n : ldv * std::min(m, n))); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if((rightv != rocblas_svect_none && rightv != rocblas_svect_singular && rightv != rocblas_svect_all) || (leftv != rocblas_svect_none && leftv != rocblas_svect_singular && leftv != rocblas_svect_all)) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. 
WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = n; mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_Sres = 0; size_t size_Ures = 0; size_t size_Vres = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(std::min(m, n)); size_t size_U = (leftvC == 'A' ? size_t(ldu) * m : size_t(ldu) * std::min(m, n)); size_t size_V = (rightvC == 'A' ? size_t(ldv) * n : size_t(ldv) * std::min(m, n)); if(argus.unit_check || argus.norm_check) { size_Sres = size_S; if(svects) { if(leftv == rocblas_svect_none) { size_UT = size_t(lduT) * std::min(mT, nT); size_Ures = size_UT; ldures = lduT; } else { size_Ures = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_VT = size_t(ldvT) * std::min(mT, nT); size_Vres = size_VT; ldvres = ldvT; } else { size_Vres = size_V; ldvres = ldv; } } } rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_Ures; rocblas_stride stVres = size_Vres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || ((leftv == rocblas_svect_all || leftv == rocblas_svect_singular) && ldu < m) || ((rightv == rocblas_svect_all || rightv == rocblas_svect_singular) && ldv < n); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, 
lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftv, rightv, m, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftvT, rightvT, mT, nT, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftv, rightv, m, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdj_notransv( STRIDED, handle, leftvT, rightvT, mT, nT, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hS(size_S, 1, stS, bc); host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_Sres, 1, stS, bc); host_strided_batch_vector Vres(size_Vres, 1, stVres, bc); host_strided_batch_vector Ures(size_Ures, 1, stUres, bc); // device device_strided_batch_vector dResidual(1, 1, 1, bc); device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { 
gesvdj_notransv_getError( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_notransv_getPerfData( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_gesvdj_notransv(STRIDED, handle, leftv, rightv, m, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdj_notransv_getError( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hResidualRes, hSweepsRes, hS, hSres, hU, Ures, ldures, hV, Vres, ldvres, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdj_notransv_getPerfData( handle, leftv, rightv, m, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dS, stS, dU, ldu, stU, dV, ldv, stV, dinfo, bc, hA, hS, hU, hV, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for 
rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * std::min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 2 * std::min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "strideA", "abstol", "max_sweeps", "strideS", "ldu", "strideU", "ldv", "strideV", "batch_c"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, stA, abstol, max_sweeps, stS, ldu, stU, ldv, stV, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "m", "n", "lda", "abstol", "max_sweeps", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, m, n, lda, abstol, max_sweeps, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvdx.cpp000066400000000000000000000033041503202240500241160ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_gesvdx.hpp" #define TESTING_GESVDX(...) template void testing_gesvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GESVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvdx.hpp000066400000000000000000001332341503202240500241310ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gesvdx_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, W dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, rocblas_int* dNsv, S* dS, const rocblas_stride stS, T dU, const rocblas_int ldu, const rocblas_stride stU, T dV, const rocblas_int ldv, const rocblas_stride stV, rocblas_int* difail, const rocblas_stride stF, rocblas_int* dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, nullptr, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, rocblas_svect_all, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, rocblas_svect_all, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, rocblas_srange(0), m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, (T) nullptr, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, (T) nullptr, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, 
left_svect, right_svect, srange, 0, n, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, (T) nullptr, ldu, stU, dV, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, 0, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, (T) nullptr, ldu, stU, (T) nullptr, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_gesvdx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_srange srange = rocblas_srange_all; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_stride stF = 2; rocblas_int bc = 1; S vl = 0; S vu = 0; rocblas_int il = 0; rocblas_int iu = 0; // memory allocations (all cases) device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dNsv(1, 1, 1, 1); device_strided_batch_vector difail(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); CHECK_HIP_ERROR(difail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments 
gesvdx_checkBadArgs(handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments gesvdx_checkBadArgs(handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); } } template void gesvdx_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); rocblas_int nn = std::min(m, n); // construct non singular matrix A such that all singular values are in (0, 20] for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { if(i == nn / 4 || i == nn / 2 || i == nn - 1 || i == nn / 7 || i == nn / 5 || i == nn / 3) hA[b][i + i * lda] = 0; for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = 2 * std::real(hA[b][i + j * lda]) - 21; else { if(m >= n) { if(j == i + 1) hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; else hA[b][i + j * lda] = 0; } else { if(i == j + 1) hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; else hA[b][i + j * lda] = 0; } } } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvdx_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const 
rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Id& dNsv, Ud& dS, const rocblas_stride stS, Td& dU, const rocblas_int ldu, const rocblas_stride stU, Td& dV, const rocblas_int ldv, const rocblas_stride stV, Id& difail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect right_svectT, const rocblas_int mT, const rocblas_int nT, Td& dUT, const rocblas_int lduT, const rocblas_stride stUT, Td& dVT, const rocblas_int ldvT, const rocblas_stride stVT, Wh& hA, Ih& hNsv, Ih& hNsvRes, Uh& hS, Uh& hSres, Th& hU, Th& hUres, const rocblas_int ldures, Th& hV, Th& hVres, const rocblas_int ldvres, Ih& hifail, Ih& hifailRes, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv) { /** As per lapack's documentation, the following workspace size should work: rocblas_int minn = std::min(m,n); rocblas_int maxn = std::max(m,n); rocblas_int lwork = minn * minn + 6 * minn + maxn; rocblas_int lrwork = 17 * minn * minn; std::vector work(lwork); std::vector rwork(lrwork); HOWEVER, gesvdx_ GIVES ILLEGAL VALUE FOR ARGUMENT lwork. Making the memory query to get the correct workspace dimension: std::vector query(1); cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[0], lda, vl, vu, il, iu, hNsv[0], hS[0], hU[0], ldu, hV[0], ldv, query.data(), -1, rwork.data(), hifail[0], hinfo[0]); rocblas_int lwork = int(std::real(query[0])); std::vector work(lwork); AND NOW gesvdx_ FAILS WITH seg fault ERROR. **/ // (TODO: Need to confirm problem with gesvdx_ and report it) /** WORKAROUND: for now, we will call gesvd_ to get all the singular values on the CPU side and offset the result array according to srange, vl, vu, il, and iu. This approach has 2 disadvantages: 1. singular values are not computed to the same accuracy by gesvd_ (QR iteration) and gesvdx_ (inverse iteration). So, comparison maybe more sensitive. 2. 
info and ifail cannot be tested as they have different meaning in gesvd_ 3. we cannot provide timing for CPU execution using gesvd_ when testing gesvdx_ **/ // (TODO: We may revisit the entire approach in the future: change to another solution, // or wait for problems with gesvdx_ to be fixed) std::vector offset(bc); rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 5 * std::min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); rocblas_int minn = std::min(m, n); // input data initialization std::vector A(lda * n * bc); gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed if(mT * nT > 0) { CHECK_ROCBLAS_ERROR(rocsolver_gesvdx(STRIDED, handle, left_svectT, right_svectT, srange, mT, nT, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, difail.data(), stF, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(hUres.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(hVres.transfer_from(dVT)); } gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { //cpu_gesvdx(rocblas_svect_none, rocblas_svect_none, srange, m, n, hA[b], lda, vl, vu, il, iu, hNsv[b], hS[b], hU[b], ldu, hV[b], ldv, // work.data(), lwork, rwork.data(), hifail[b], hinfo[b]); /*** WORKAROUND: ***/ cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); hNsv[b][0] = 0; offset[b] = -1; if(srange == rocblas_srange_index) { offset[b] = il - 1; hNsv[b][0] = iu - il + 1; } else if(srange == rocblas_srange_value) { for(int j = 0; j < minn; ++j) { if(hS[b][j] < vu && hS[b][j] >= vl) { if(offset[b] == -1) offset[b] = j; 
hNsv[b][0]++; } } } else { offset[b] = 0; hNsv[b][0] = minn; } /*******************/ } // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc)); CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hNsvRes.transfer_from(dNsv)); CHECK_HIP_ERROR(hifailRes.transfer_from(difail)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular) CHECK_HIP_ERROR(hUres.transfer_from(dU)); if(right_svect == rocblas_svect_singular) CHECK_HIP_ERROR(hVres.transfer_from(dV)); *max_err = 0; *max_errv = 0; // Check info and ifail for non-convergence // (NOTE: With the workaround in place, info and ifail cannot be tested as they have different // meaning in gesvd_, however, We expect the used input matrices to always converge) /*for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; for(int j = 0; j < hNsv[b][0]; ++j) { EXPECT_EQ(hifail[b][j], hifailRes[b][j]) << "where b = " << b << ", j = " << j; if(hifail[b][j] != hifailRes[b][j]) *max_err += 1; } }*/ double err = 0; for(rocblas_int b = 0; b < bc; ++b) { // check number of computed singular values rocblas_int nn = hNsvRes[b][0]; *max_err += std::abs(nn - hNsv[b][0]); EXPECT_EQ(hNsv[b][0], hNsvRes[b][0]) << "where b = " << b; // error is ||hS - hSres|| err = norm_error('F', 1, hNsv[b][0], 1, hS[b] + offset[b], hSres[b]); //WORKAROUND *max_err = err > *max_err ? 
err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { // U and V should be orthonormal, if they are then U^T*U and V*V^T should be the identity if(nn > 0) { std::vector UUres(nn * nn, 0.0); std::vector VVres(nn * nn, 0.0); std::vector I(nn * nn, 0.0); for(rocblas_int i = 0; i < nn; i++) I[i + i * nn] = T(1); cpu_gemm(rocblas_operation_conjugate_transpose, rocblas_operation_none, nn, nn, m, T(1), hUres[b], ldures, hUres[b], ldures, T(0), UUres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), UUres.data()); *max_errv = err > *max_errv ? err : *max_errv; cpu_gemm(rocblas_operation_none, rocblas_operation_conjugate_transpose, nn, nn, n, T(1), hVres[b], ldvres, hVres[b], ldvres, T(0), VVres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), VVres.data()); *max_errv = err > *max_errv ? err : *max_errv; } err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < hNsv[b][0]; ++k) { T tmp = 0; double tmp2 = 0; // (Comparing absolute values to deal with the fact that the pair of singular vectors (u,-v) or (-u,v) are // both ok and we could get either one with the complementary or main executions when only // one side set of vectors is required. May be revisited in the future.) for(rocblas_int i = 0; i < m; ++i) { tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * sconj(hVres[b][k + j * ldvres]); tmp2 = std::abs(tmp) - std::abs(hSres[b][k] * hUres[b][i + k * ldures]); err += tmp2 * tmp2; } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv; } } } template void gesvdx_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Id& dNsv, Ud& dS, const rocblas_stride stS, Td& dU, const rocblas_int ldu, const rocblas_stride stU, Td& dV, const rocblas_int ldv, const rocblas_stride stV, Id& difail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, Wh& hA, Ih& hNsv, Uh& hS, Th& hU, Th& hV, Ih& hifail, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { /** As per lapack's documentation, the following workspace size should work: rocblas_int minn = std::min(m,n); rocblas_int maxn = std::max(m,n); rocblas_int lwork = minn * minn + 6 * minn + maxn; rocblas_int lrwork = 17 * minn * minn; std::vector work(lwork); std::vector rwork(lrwork); HOWEVER, gesvdx_ GIVES ILLEGAL VALUE FOR ARGUMENT lwork. Making the memory query to get the correct workspace dimension: std::vector query(1); cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[0], lda, vl, vu, il, iu, hNsv[0], hS[0], hU[0], ldu, hV[0], ldv, query.data(), -1, rwork.data(), hifail[0], hinfo[0]); rocblas_int lwork = int(std::real(query[0])); std::vector work(lwork); AND NOW gesvdx_ FAILS WITH seg fault ERROR. 
**/ // (TODO: Need to confirm problem with gesvdx_ and report it) // For now we cannot report cpu time std::vector A(lda * n * bc); if(!perf) { //gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) //*cpu_time_used = get_time_us_no_sync(); //for(rocblas_int b = 0; b < bc; ++b) // cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[b], lda, vl, vu, il, iu, hNsv[b], hS[b], hU[b], ldu, hV[b], ldv, // work.data(), lwork, rwork.data(), hifail[b], hinfo[b]); //*cpu_time_used = get_time_us_no_sync() - *cpu_time_used; *cpu_time_used = nan(""); } gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvdx_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesvdx(STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_gesvdx(Arguments& argus) { using S = decltype(std::real(T{})); // 
get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); char srangeC = argus.get("srange"); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_srange srange = char2rocblas_srange(srangeC); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", srangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", srangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", srangeC == 'I' ? 1 : 0); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nn = std::min(m, n); rocblas_int nsv_max = (srange == rocblas_srange_index ? iu - il + 1 : nn); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", nsv_max); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", nsv_max); rocblas_stride stF = argus.get("strideF", nn); rocblas_stride stU = argus.get("strideU", ldu * nsv_max); rocblas_stride stV = argus.get("strideV", ldv * n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(rightv == rocblas_svect_overwrite || leftv == rocblas_svect_overwrite || rightv == rocblas_svect_all || leftv == rocblas_svect_all) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); 
return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = nsv_max; mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_hSres = 0; size_t size_hUres = 0; size_t size_hVres = 0; size_t size_hifailRes = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(nsv_max); size_t size_S_cpu = size_t(nn); size_t size_V = size_t(ldv) * n; size_t size_U = size_t(ldu) * nsv_max; size_t size_ifail = nn; if(argus.unit_check || argus.norm_check) { size_hifailRes = nn; size_VT = size_t(ldvT) * n; size_UT = size_t(lduT) * nsv_max; size_hSres = nsv_max; if(svects) { if(leftv == rocblas_svect_none) { size_hUres = size_UT; ldures = lduT; } else { size_hUres = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_hVres = size_VT; ldvres = ldvT; } else { size_hVres = size_V; ldvres = ldv; } } } rocblas_stride stS_cpu = size_S_cpu; rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_hUres; rocblas_stride stVres = size_hVres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 1 || ldv < 1 || bc < 0) || (leftv == rocblas_svect_singular && ldu < m) || (rightv == 
rocblas_svect_singular && ldv < nsv_max) || (srange == rocblas_srange_value && (vl < 0 || vl >= vu)) || (srange == rocblas_srange_index && (il < 1 || iu < 0)) || (srange == rocblas_srange_index && (iu > nn || (nn > 0 && il > iu))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, leftvT, rightvT, srange, mT, nT, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdx( STRIDED, handle, 
leftvT, rightvT, srange, mT, nT, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hS(size_S_cpu, 1, stS_cpu, bc); // extra space for cpu_gesvd call host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hNsv(1, 1, 1, bc); host_strided_batch_vector hNsvRes(1, 1, 1, bc); host_strided_batch_vector hifail(12 * nn, 1, stF, bc); host_strided_batch_vector hifailRes(size_hifailRes, 1, stF, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_hSres, 1, stS, bc); host_strided_batch_vector hVres(size_hVres, 1, stVres, bc); host_strided_batch_vector hUres(size_hUres, 1, stUres, bc); // device device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dNsv(1, 1, 1, bc); device_strided_batch_vector difail(size_ifail, 1, stF, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); if(size_ifail) CHECK_HIP_ERROR(difail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); if(BATCHED) { // memory allocations 
host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdx_getError( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hNsv, hNsvRes, hS, hSres, hU, hUres, ldures, hV, hVres, ldvres, hifail, hifailRes, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdx_getPerfData(handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, hA, hNsv, hS, hU, hV, hifail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx(STRIDED, handle, leftv, rightv, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdx_getError( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, 
difail, stF, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hNsv, hNsvRes, hS, hSres, hU, hUres, ldures, hV, hVres, ldvres, hifail, hifailRes, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdx_getPerfData(handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, hA, hNsv, hS, hU, hV, hifail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * std::min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 4 * std::min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "vl", "vu", "il", "iu", "strideS", "ldu", "strideU", "ldv", "strideV", "strideF", "batch_c"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, vl, vu, il, iu, stS, ldu, stU, ldv, stV, stF, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "strideA", "vl", "vu", "il", "iu", "strideS", "ldu", "strideU", "ldv", "strideV", "strideF", "batch_c"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, stA, vl, vu, il, iu, stS, ldu, stU, ldv, stV, stF, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "vl", "vu", "il", "iu", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, vl, vu, il, iu, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, 
max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GESVDX(...) extern template void testing_gesvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GESVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_gesvdx_notransv.hpp000066400000000000000000001373671503202240500260760ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void gesvdx_notransv_checkBadArgs(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, W dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, rocblas_int* dNsv, S* dS, const rocblas_stride stS, T dU, const rocblas_int ldu, const rocblas_stride stU, T dV, const rocblas_int ldv, const rocblas_stride stV, rocblas_int* difail, const rocblas_stride stF, rocblas_int* dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, nullptr, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, rocblas_svect_all, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); 
EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, rocblas_svect_all, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, rocblas_srange(0), m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, (T) nullptr, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, (T) nullptr, ldv, stV, difail, stF, dinfo, bc), 
rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv( STRIDED, handle, left_svect, right_svect, srange, 0, n, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, (T) nullptr, ldu, stU, dV, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv( STRIDED, handle, left_svect, right_svect, srange, m, 0, (W) nullptr, lda, stA, vl, vu, il, iu, dNsv, (S*)nullptr, stS, (T) nullptr, ldu, stU, (T) nullptr, ldv, stV, (rocblas_int*)nullptr, stF, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_gesvdx_notransv_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_svect left_svect = rocblas_svect_singular; rocblas_svect right_svect = rocblas_svect_singular; rocblas_srange srange = rocblas_srange_all; rocblas_int m = 2; rocblas_int n = 2; rocblas_int lda = 2; rocblas_int ldu = 2; rocblas_int ldv = 2; rocblas_stride stA = 2; rocblas_stride stS = 2; rocblas_stride stU = 2; rocblas_stride stV = 2; rocblas_stride stF = 2; rocblas_int bc = 1; S vl = 0; S vu = 0; rocblas_int 
il = 0; rocblas_int iu = 0; // memory allocations (all cases) device_strided_batch_vector dS(1, 1, 1, 1); device_strided_batch_vector dU(1, 1, 1, 1); device_strided_batch_vector dV(1, 1, 1, 1); device_strided_batch_vector dNsv(1, 1, 1, 1); device_strided_batch_vector difail(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dS.memcheck()); CHECK_HIP_ERROR(dU.memcheck()); CHECK_HIP_ERROR(dV.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); CHECK_HIP_ERROR(difail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments gesvdx_notransv_checkBadArgs(handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); // check bad arguments gesvdx_notransv_checkBadArgs(handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv, dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); } } template void gesvdx_notransv_initData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_int m, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); rocblas_int nn = std::min(m, n); // construct non singular matrix A such that all singular values are in (0, 20] for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < m; i++) { if(i == nn / 4 || i == nn / 2 || i == nn - 1 || i == nn / 7 || i == nn / 5 || i == nn / 3) hA[b][i + i * lda] = 0; for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = 2 * std::real(hA[b][i + j * lda]) - 21; else { if(m >= n) { if(j == i + 1) hA[b][i + j * lda] = 
(hA[b][i + j * lda] - 5) / 10; else hA[b][i + j * lda] = 0; } else { if(i == j + 1) hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; else hA[b][i + j * lda] = 0; } } } } // make copy of original data to test vectors if required if(test && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { for(rocblas_int i = 0; i < m; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void gesvdx_notransv_getError(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Id& dNsv, Ud& dS, const rocblas_stride stS, Td& dU, const rocblas_int ldu, const rocblas_stride stU, Td& dV, const rocblas_int ldv, const rocblas_stride stV, Id& difail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, const rocblas_svect left_svectT, const rocblas_svect right_svectT, const rocblas_int mT, const rocblas_int nT, Td& dUT, const rocblas_int lduT, const rocblas_stride stUT, Td& dVT, const rocblas_int ldvT, const rocblas_stride stVT, Wh& hA, Ih& hNsv, Ih& hNsvRes, Uh& hS, Uh& hSres, Th& hU, Th& hUres, const rocblas_int ldures, Th& hV, Th& hVres, const rocblas_int ldvres, Ih& hifail, Ih& hifailRes, Ih& hinfo, Ih& hinfoRes, double* max_err, double* max_errv) { /** As per lapack's documentation, the following workspace size should work: rocblas_int minn = std::min(m,n); rocblas_int maxn = std::max(m,n); rocblas_int lwork = minn * minn + 6 * minn + maxn; rocblas_int lrwork = 17 * minn * minn; std::vector work(lwork); std::vector rwork(lrwork); HOWEVER, gesvdx_ GIVES ILLEGAL VALUE FOR ARGUMENT lwork. 
Making the memory query to get the correct workspace dimension: std::vector query(1); cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[0], lda, vl, vu, il, iu, hNsv[0], hS[0], hU[0], ldu, hV[0], ldv, query.data(), -1, rwork.data(), hifail[0], hinfo[0]); rocblas_int lwork = int(std::real(query[0])); std::vector work(lwork); AND NOW gesvdx_ FAILS WITH seg fault ERROR. **/ // (TODO: Need to confirm problem with gesvdx_ and report it) /** WORKAROUND: for now, we will call gesvd_ to get all the singular values on the CPU side and offset the result array according to srange, vl, vu, il, and iu. This approach has 2 disadvantages: 1. singular values are not computed to the same accuracy by gesvd_ (QR iteration) and gesvdx_ (inverse iteration). So, comparison maybe more sensitive. 2. info and ifail cannot be tested as they have different meaning in gesvd_ 3. we cannot provide timing for CPU execution using gesvd_ when testing gesvdx_ **/ // (TODO: We may revisit the entire approach in the future: change to another solution, // or wait for problems with gesvdx_ to be fixed) std::vector offset(bc); rocblas_int lwork = 5 * std::max(m, n); rocblas_int lrwork = (rocblas_is_complex ? 
5 * std::min(m, n) : 0); std::vector work(lwork); std::vector rwork(lrwork); rocblas_int minn = std::min(m, n); // input data initialization std::vector A(lda * n * bc); gesvdx_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // execute computations: // complementary execution to compute all singular vectors if needed if(mT * nT > 0) { CHECK_ROCBLAS_ERROR(rocsolver_gesvdx_notransv( STRIDED, handle, left_svectT, right_svectT, srange, mT, nT, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dUT.data(), lduT, stUT, dVT.data(), ldvT, stVT, difail.data(), stF, dinfo.data(), bc)); if(left_svect == rocblas_svect_none && right_svect != rocblas_svect_none) CHECK_HIP_ERROR(hUres.transfer_from(dUT)); if(right_svect == rocblas_svect_none && left_svect != rocblas_svect_none) CHECK_HIP_ERROR(hVres.transfer_from(dVT)); } gesvdx_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { //cpu_gesvdx(rocblas_svect_none, rocblas_svect_none, srange, m, n, hA[b], lda, vl, vu, il, iu, hNsv[b], hS[b], hU[b], ldu, hV[b], ldv, // work.data(), lwork, rwork.data(), hifail[b], hinfo[b]); /*** WORKAROUND: ***/ cpu_gesvd(rocblas_svect_none, rocblas_svect_none, m, n, hA[b], lda, hS[b], hU[b], ldu, hV[b], ldv, work.data(), lwork, rwork.data(), hinfo[b]); hNsv[b][0] = 0; offset[b] = -1; if(srange == rocblas_srange_index) { offset[b] = il - 1; hNsv[b][0] = iu - il + 1; } else if(srange == rocblas_srange_value) { for(int j = 0; j < minn; ++j) { if(hS[b][j] < vu && hS[b][j] >= vl) { if(offset[b] == -1) offset[b] = j; hNsv[b][0]++; } } } else { offset[b] = 0; hNsv[b][0] = minn; } /*******************/ } // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_gesvdx_notransv( STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc)); 
CHECK_HIP_ERROR(hSres.transfer_from(dS)); CHECK_HIP_ERROR(hNsvRes.transfer_from(dNsv)); CHECK_HIP_ERROR(hifailRes.transfer_from(difail)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(left_svect == rocblas_svect_singular) CHECK_HIP_ERROR(hUres.transfer_from(dU)); if(right_svect == rocblas_svect_singular) CHECK_HIP_ERROR(hVres.transfer_from(dV)); *max_err = 0; *max_errv = 0; // Check info and ifail for non-convergence // (NOTE: With the workaround in place, info and ifail cannot be tested as they have different // meaning in gesvd_, however, We expect the used input matrices to always converge) /*for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; for(int j = 0; j < hNsv[b][0]; ++j) { EXPECT_EQ(hifail[b][j], hifailRes[b][j]) << "where b = " << b << ", j = " << j; if(hifail[b][j] != hifailRes[b][j]) *max_err += 1; } }*/ double err = 0; for(rocblas_int b = 0; b < bc; ++b) { // check number of computed singular values rocblas_int nn = hNsvRes[b][0]; *max_err += std::abs(nn - hNsv[b][0]); EXPECT_EQ(hNsv[b][0], hNsvRes[b][0]) << "where b = " << b; // error is ||hS - hSres|| err = norm_error('F', 1, nn, 1, hS[b] + offset[b], hSres[b]); //WORKAROUND *max_err = err > *max_err ? err : *max_err; // Check the singular vectors if required if(hinfo[b][0] == 0 && (left_svect != rocblas_svect_none || right_svect != rocblas_svect_none)) { // U and V should be orthonormal, if they are then U^T*U and V^T*V should be the identity if(nn > 0) { std::vector UUres(nn * nn, 0.0); std::vector VVres(nn * nn, 0.0); std::vector I(nn * nn, 0.0); for(rocblas_int i = 0; i < nn; i++) I[i + i * nn] = T(1); cpu_gemm(rocblas_operation_conjugate_transpose, rocblas_operation_none, nn, nn, m, T(1), hUres[b], ldures, hUres[b], ldures, T(0), UUres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), UUres.data()); *max_errv = err > *max_errv ? 
err : *max_errv; cpu_gemm(rocblas_operation_conjugate_transpose, rocblas_operation_none, nn, nn, n, T(1), hVres[b], ldvres, hVres[b], ldvres, T(0), VVres.data(), nn); err = norm_error('F', nn, nn, nn, I.data(), VVres.data()); *max_errv = err > *max_errv ? err : *max_errv; } err = 0; // check singular vectors implicitly (A*v_k = s_k*u_k) for(rocblas_int k = 0; k < nn; ++k) { T tmp = 0; double tmp2 = 0; // (Comparing absolute values to deal with the fact that the pair of singular vectors (u,-v) or (-u,v) are // both ok and we could get either one with the complementary or main executions when only // one side set of vectors is required. May be revisited in the future.) for(rocblas_int i = 0; i < m; ++i) { tmp = 0; for(rocblas_int j = 0; j < n; ++j) tmp += A[b * lda * n + i + j * lda] * hVres[b][j + k * ldvres]; tmp2 = std::abs(tmp) - std::abs(hSres[b][k] * hUres[b][i + k * ldures]); err += tmp2 * tmp2; } } err = std::sqrt(err) / double(snorm('F', m, n, A.data() + b * lda * n, lda)); *max_errv = err > *max_errv ? 
err : *max_errv; } } } template void gesvdx_notransv_getPerfData(const rocblas_handle handle, const rocblas_svect left_svect, const rocblas_svect right_svect, const rocblas_srange srange, const rocblas_int m, const rocblas_int n, Wd& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Id& dNsv, Ud& dS, const rocblas_stride stS, Td& dU, const rocblas_int ldu, const rocblas_stride stU, Td& dV, const rocblas_int ldv, const rocblas_stride stV, Id& difail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, Wh& hA, Ih& hNsv, Uh& hS, Th& hU, Th& hV, Ih& hifail, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { /** As per lapack's documentation, the following workspace size should work: rocblas_int minn = std::min(m,n); rocblas_int maxn = std::max(m,n); rocblas_int lwork = minn * minn + 6 * minn + maxn; rocblas_int lrwork = 17 * minn * minn; std::vector work(lwork); std::vector rwork(lrwork); HOWEVER, gesvdx_ GIVES ILLEGAL VALUE FOR ARGUMENT lwork. Making the memory query to get the correct workspace dimension: std::vector query(1); cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[0], lda, vl, vu, il, iu, hNsv[0], hS[0], hU[0], ldu, hV[0], ldv, query.data(), -1, rwork.data(), hifail[0], hinfo[0]); rocblas_int lwork = int(std::real(query[0])); std::vector work(lwork); AND NOW gesvdx_ FAILS WITH seg fault ERROR. 
**/ // (TODO: Need to confirm problem with gesvdx_ and report it) // For now we cannot report cpu time std::vector A(lda * n * bc); if(!perf) { //gesvdx_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) //*cpu_time_used = get_time_us_no_sync(); //for(rocblas_int b = 0; b < bc; ++b) // cpu_gesvdx(left_svect, right_svect, srange, m, n, hA[b], lda, vl, vu, il, iu, hNsv[b], hS[b], hU[b], ldu, hV[b], ldv, // work.data(), lwork, rwork.data(), hifail[b], hinfo[b]); //*cpu_time_used = get_time_us_no_sync() - *cpu_time_used; *cpu_time_used = nan(""); } gesvdx_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { gesvdx_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_gesvdx_notransv( STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { gesvdx_notransv_initData(handle, left_svect, right_svect, m, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_gesvdx_notransv(STRIDED, handle, left_svect, right_svect, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void 
testing_gesvdx_notransv(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char leftvC = argus.get("left_svect"); char rightvC = argus.get("right_svect"); char srangeC = argus.get("srange"); rocblas_svect leftv = char2rocblas_svect(leftvC); rocblas_svect rightv = char2rocblas_svect(rightvC); rocblas_srange srange = char2rocblas_srange(srangeC); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", srangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", srangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", srangeC == 'I' ? 1 : 0); rocblas_int m = argus.get("m"); rocblas_int n = argus.get("n", m); rocblas_int nn = std::min(m, n); rocblas_int nsv_max = (srange == rocblas_srange_index ? iu - il + 1 : nn); rocblas_int lda = argus.get("lda", m); rocblas_int ldu = argus.get("ldu", m); rocblas_int ldv = argus.get("ldv", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stS = argus.get("strideS", nsv_max); rocblas_stride stF = argus.get("strideF", nn); rocblas_stride stU = argus.get("strideU", ldu * nsv_max); rocblas_stride stV = argus.get("strideV", ldv * nsv_max); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(rightv == rocblas_svect_overwrite || leftv == rocblas_svect_overwrite || rightv == rocblas_svect_all || leftv == rocblas_svect_all) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx_notransv( STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv( STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, 
bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } /** TESTING OF SINGULAR VECTORS IS DONE IMPLICITLY, NOT EXPLICITLY COMPARING WITH LAPACK. SO, WE ALWAYS NEED TO COMPUTE THE SAME NUMBER OF ELEMENTS OF THE RIGHT AND LEFT VECTORS. WHILE DOING THIS, IF MORE VECTORS THAN THE SPECIFIED IN THE MAIN CALL NEED TO BE COMPUTED, WE DO SO WITH AN EXTRA CALL **/ rocblas_svect leftvT = rocblas_svect_none; rocblas_svect rightvT = rocblas_svect_none; rocblas_int ldvT = 1; rocblas_int lduT = 1; rocblas_int mT = 0; rocblas_int nT = 0; bool svects = (leftv != rocblas_svect_none || rightv != rocblas_svect_none); if(svects) { if(leftv == rocblas_svect_none) { leftvT = rocblas_svect_singular; lduT = m; mT = m; nT = n; } if(rightv == rocblas_svect_none) { rightvT = rocblas_svect_singular; ldvT = n; mT = m; nT = n; } } // determine sizes rocblas_int ldures = 1; rocblas_int ldvres = 1; size_t size_hSres = 0; size_t size_hUres = 0; size_t size_hVres = 0; size_t size_hifailRes = 0; size_t size_UT = 0; size_t size_VT = 0; size_t size_A = size_t(lda) * n; size_t size_S = size_t(nsv_max); size_t size_S_cpu = size_t(nn); size_t size_V = size_t(ldv) * nsv_max; size_t size_U = size_t(ldu) * nsv_max; size_t size_ifail = nn; if(argus.unit_check || argus.norm_check) { size_hifailRes = nn; size_VT = size_t(ldvT) * nsv_max; size_UT = size_t(lduT) * nsv_max; size_hSres = nsv_max; if(svects) { if(leftv == rocblas_svect_none) { size_hUres = size_UT; ldures = lduT; } else { size_hUres = size_U; ldures = ldu; } if(rightv == rocblas_svect_none) { size_hVres = size_VT; ldvres = ldvT; } else { size_hVres = size_V; ldvres = ldv; } } } rocblas_stride stS_cpu = size_S_cpu; rocblas_stride stUT = size_UT; rocblas_stride stVT = size_VT; rocblas_stride stUres = size_hUres; rocblas_stride stVres = size_hVres; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0, max_errorv = 0; // check invalid sizes bool invalid_size = (n < 0 || m < 0 || lda < m || ldu < 
1 || ldv < 1 || bc < 0) || (leftv == rocblas_svect_singular && ldu < m) || (rightv == rocblas_svect_singular && ldv < n) || (srange == rocblas_srange_value && (vl < 0 || vl >= vu)) || (srange == rocblas_srange_index && (il < 1 || iu < 0)) || (srange == rocblas_srange_index && (iu > nn || (nn > 0 && il > iu))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_gesvdx_notransv( STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv( STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) { CHECK_ALLOC_QUERY(rocsolver_gesvdx_notransv( STRIDED, handle, leftv, rightv, srange, m, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, (T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdx_notransv( STRIDED, handle, leftvT, rightvT, srange, mT, nT, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); } else { CHECK_ALLOC_QUERY(rocsolver_gesvdx_notransv( STRIDED, handle, leftv, rightv, srange, m, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, ldu, stU, 
(T*)nullptr, ldv, stV, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); CHECK_ALLOC_QUERY(rocsolver_gesvdx_notransv( STRIDED, handle, leftvT, rightvT, srange, mT, nT, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stS, (T*)nullptr, lduT, stUT, (T*)nullptr, ldvT, stVT, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); } size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hS(size_S_cpu, 1, stS_cpu, bc); // extra space for cpu_gesvd call host_strided_batch_vector hV(size_V, 1, stV, bc); host_strided_batch_vector hU(size_U, 1, stU, bc); host_strided_batch_vector hNsv(1, 1, 1, bc); host_strided_batch_vector hNsvRes(1, 1, 1, bc); host_strided_batch_vector hifail(12 * nn, 1, stF, bc); host_strided_batch_vector hifailRes(size_hifailRes, 1, stF, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hSres(size_hSres, 1, stS, bc); host_strided_batch_vector hVres(size_hVres, 1, stVres, bc); host_strided_batch_vector hUres(size_hUres, 1, stUres, bc); // device device_strided_batch_vector dS(size_S, 1, stS, bc); device_strided_batch_vector dV(size_V, 1, stV, bc); device_strided_batch_vector dU(size_U, 1, stU, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); device_strided_batch_vector dNsv(1, 1, 1, bc); device_strided_batch_vector difail(size_ifail, 1, stF, bc); device_strided_batch_vector dVT(size_VT, 1, stVT, bc); device_strided_batch_vector dUT(size_UT, 1, stUT, bc); if(size_VT) CHECK_HIP_ERROR(dVT.memcheck()); if(size_UT) CHECK_HIP_ERROR(dUT.memcheck()); if(size_S) CHECK_HIP_ERROR(dS.memcheck()); if(size_V) CHECK_HIP_ERROR(dV.memcheck()); if(size_U) CHECK_HIP_ERROR(dU.memcheck()); if(size_ifail) 
CHECK_HIP_ERROR(difail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); CHECK_HIP_ERROR(dNsv.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, leftv, rightv, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { gesvdx_notransv_getError( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hNsv, hNsvRes, hS, hSres, hU, hUres, ldures, hV, hVres, ldvres, hifail, hifailRes, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdx_notransv_getPerfData( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, hA, hNsv, hS, hU, hV, hifail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || m == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_gesvdx_notransv(STRIDED, handle, leftv, rightv, srange, m, n, dA.data(), lda, stA, vl, vu, il, iu, dNsv.data(), dS.data(), stS, dU.data(), ldu, stU, dV.data(), ldv, stV, difail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check 
computations if(argus.unit_check || argus.norm_check) { gesvdx_notransv_getError( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, leftvT, rightvT, mT, nT, dUT, lduT, stUT, dVT, ldvT, stVT, hA, hNsv, hNsvRes, hS, hSres, hU, hUres, ldures, hV, hVres, ldvres, hifail, hifailRes, hinfo, hinfoRes, &max_error, &max_errorv); } // collect performance data if(argus.timing) { gesvdx_notransv_getPerfData( handle, leftv, rightv, srange, m, n, dA, lda, stA, vl, vu, il, iu, dNsv, dS, stS, dU, ldu, stU, dV, ldv, stV, difail, stF, dinfo, bc, hA, hNsv, hS, hU, hV, hifail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * min(m, n) * machine_precision as tolerance if(argus.unit_check) { ROCSOLVER_TEST_CHECK(T, max_error, 2 * std::min(m, n)); if(svects) ROCSOLVER_TEST_CHECK(T, max_errorv, 4 * std::min(m, n)); } // output results for rocsolver-bench if(argus.timing) { if(svects) max_error = (max_error >= max_errorv) ? 
max_error : max_errorv; if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "vl", "vu", "il", "iu", "strideS", "ldu", "strideU", "ldv", "strideV", "strideF", "batch_c"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, vl, vu, il, iu, stS, ldu, stU, ldv, stV, stF, bc); } else if(STRIDED) { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "strideA", "vl", "vu", "il", "iu", "strideS", "ldu", "strideU", "ldv", "strideV", "strideF", "batch_c"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, stA, vl, vu, il, iu, stS, ldu, stU, ldv, stV, stF, bc); } else { rocsolver_bench_output("left_svect", "right_svect", "srange", "m", "n", "lda", "vl", "vu", "il", "iu", "ldu", "ldv"); rocsolver_bench_output(leftvC, rightvC, srangeC, m, n, lda, vl, vu, il, iu, ldu, ldv); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getf2_getrf.cpp000066400000000000000000000034771503202240500250270ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_getf2_getrf.hpp" #define TESTING_GETF2_GETRF(...) template void testing_getf2_getrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETF2_GETRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getf2_getrf.hpp000066400000000000000000000563441503202240500250350ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void getf2_getrf_checkBadArgs(const rocblas_handle handle, const I m, const I n, Td dA, const I lda, const rocblas_stride stA, Id dIpiv, const rocblas_stride stP, Id dInfo, const I bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf(STRIDED, GETRF, nullptr, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (Td) nullptr, lda, stA, dIpiv, stP, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, (Id) nullptr, stP, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, (Id) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, 0, n, (Td) nullptr, lda, stA, (Id) nullptr, stP, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, 0, (Td) nullptr, lda, stA, (Id) nullptr, stP, dInfo, bc), rocblas_status_success); if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, (Id) nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) 
EXPECT_ROCBLAS_STATUS( rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, 0), rocblas_status_success); } template void testing_getf2_getrf_bad_arg() { // safe arguments rocblas_local_handle handle; I m = 1; I n = 1; I lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; I bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getf2_getrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getf2_getrf_checkBadArgs(handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } } template void getf2_getrf_initData(const rocblas_handle handle, const I m, const I n, Td& dA, const I lda, const rocblas_stride stA, Id& dIpiv, const rocblas_stride stP, Id& dInfo, const I bc, Th& hA, Uh& hIpiv, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(I b = 0; b < bc; ++b) { // scale A to avoid singularities for(I i = 0; i < m; i++) { for(I j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(I i = 0; i < m / 2; i++) { for(I j = 0; j < n; j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][m - 1 - i + j * lda]; hA[b][m - 1 - i + j * lda] = tmp; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // When required, add some singularities // (always the same elements for debugging purposes). 
// The algorithm must detect the first zero pivot in those // matrices in the batch that are singular I j = n / 4 + b; j -= (j / n) * n; for(I i = 0; i < m; i++) hA[b][i + j * lda] = 0; j = n / 2 + b; j -= (j / n) * n; for(I i = 0; i < m; i++) hA[b][i + j * lda] = 0; j = n - 1 + b; j -= (j / n) * n; for(I i = 0; i < m; i++) hA[b][i + j * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void getf2_getrf_getError(const rocblas_handle handle, const I m, const I n, Td& dA, const I lda, const rocblas_stride stA, Id& dIpiv, const rocblas_stride stP, Id& dInfo, const I bc, Th& hA, Th& hARes, Uh& hIpiv, Ih& hIpivRes, Uh& hInfo, Ih& hInfoRes, double* max_err, const bool singular, size_t& hashA, size_t& hashARes, size_t& hashIpivRes) { // input data initialization getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, singular); // compute input hashes hashA = deterministic_hash(hA, bc); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hIpivRes.transfer_from(dIpiv)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(I b = 0; b < bc; ++b) { GETRF ? cpu_getrf(m, n, hA[b], lda, hIpiv[b], hInfo[b]) : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hInfo[b]); } // compute output hashes hashARes = deterministic_hash(hARes, bc); hashIpivRes = deterministic_hash(hIpivRes); // expecting original matrix to be non-singular // error is ||hA - hARes|| / ||hA|| (ideally ||LU - Lres Ures|| / ||LU||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(I b = 0; b < bc; ++b) { err = norm_error('F', m, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; // also check pivoting (count the number of incorrect pivots) err = 0; for(I i = 0; i < min(m, n); ++i) { EXPECT_EQ(hIpiv[b][i], hIpivRes[b][i]) << "where b = " << b << ", i = " << i; if(hIpiv[b][i] != hIpivRes[b][i]) err++; } *max_err = err > *max_err ? err : *max_err; } // also check info for singularities err = 0; for(I b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; } template void getf2_getrf_getPerfData(const rocblas_handle handle, const I m, const I n, Td& dA, const I lda, const rocblas_stride stA, Id& dIpiv, const rocblas_stride stP, Id& dInfo, const I bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(I b = 0; b < bc; ++b) { GETRF ? 
cpu_getrf(m, n, hA[b], lda, hIpiv[b], hInfo[b]) : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, singular); CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { getf2_getrf_initData(handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, singular); start = get_time_us_sync(stream); rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getf2_getrf(Arguments& argus) { // get arguments rocblas_local_handle handle; I m = argus.get("m"); I n = argus.get("n", m); I lda = argus.get("lda", m); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", min(m, n)); I bc = argus.batch_count; int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check || argus.hash_check) ? stA : 0; rocblas_stride stPRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? 
stP : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(min(m, n)); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t hashA = 0, hashARes = 0, hashIpivRes = 0; size_t size_ARes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_A : 0; size_t size_PRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_P : 0; // check invalid sizes bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T* const*)nullptr, lda, stA, (I*)nullptr, stP, (I*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T*)nullptr, lda, stA, (I*)nullptr, stP, (I*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T* const*)nullptr, lda, stA, (I*)nullptr, stP, (I*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, (T*)nullptr, lda, stA, (I*)nullptr, stP, (I*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); 
device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) getf2_getrf_getError( handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashARes, hashIpivRes); // collect performance data if(argus.timing) getf2_getrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(m == 0 || n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check 
|| argus.hash_check) getf2_getrf_getError( handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashARes, hashIpivRes); // collect performance data if(argus.timing) getf2_getrf_getPerfData( handle, m, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using min(m,n) * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, min(m, n)); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("m", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("m", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(m, n, lda, stA, stP, bc); } else { rocsolver_bench_output("m", "n", "lda"); rocsolver_bench_output(m, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); if(argus.hash_check) { rocsolver_bench_output("hash(A)", "hash(ARes)", "hash(ipivRes)"); rocsolver_bench_output(ROCSOLVER_FORMAT_HASH(hashA), ROCSOLVER_FORMAT_HASH(hashARes), ROCSOLVER_FORMAT_HASH(hashIpivRes)); rocsolver_bench_endl(); } } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETF2_GETRF(...) 
\ extern template void testing_getf2_getrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETF2_GETRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getf2_getrf_npvt.cpp000066400000000000000000000035311503202240500260650ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_getf2_getrf_npvt.hpp" #define TESTING_GETF2_GETRF_NPVT(...) 
\ template void testing_getf2_getrf_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETF2_GETRF_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getf2_getrf_npvt.hpp000066400000000000000000000472211503202240500260760ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/
#pragma once

#include "common/misc/client_util.hpp"
#include "common/misc/clientcommon.hpp"
#include "common/misc/lapack_host_reference.hpp"
#include "common/misc/norm.hpp"
#include "common/misc/rocsolver.hpp"
#include "common/misc/rocsolver_arguments.hpp"
#include "common/misc/rocsolver_test.hpp"

// NOTE(review): in this copy of the file everything that was written between angle
// brackets appears to have been lost (template parameter lists such as `template void ...`,
// and explicit template arguments such as `argus.get("m")` or `device_batch_vector dA`).
// The names STRIDED, GETRF, BATCHED, CPU, GPU, T, I, Td, Id, Th, Uh and Ih used below are
// presumably template parameters -- restore them from the upstream sources before compiling.

/*
 * Calls rocsolver_getf2_getrf_npvt with deliberately invalid arguments and checks the
 * status that comes back:
 *   - null handle                              -> rocblas_status_invalid_handle
 *   - batch count of -1 (only when STRIDED)    -> rocblas_status_invalid_size
 *   - null dA, or null dInfo                   -> rocblas_status_invalid_pointer
 *   - m == 0 or n == 0 together with a null dA -> rocblas_status_success (quick return)
 *   - batch count of 0 (only when STRIDED)     -> rocblas_status_success
 */
template void getf2_getrf_npvt_checkBadArgs(const rocblas_handle handle,
                                            const I m,
                                            const I n,
                                            Td dA,
                                            const I lda,
                                            const rocblas_stride stA,
                                            Id dInfo,
                                            const I bc)
{
    // handle
    EXPECT_ROCBLAS_STATUS(
        rocsolver_getf2_getrf_npvt(STRIDED, GETRF, nullptr, m, n, dA, lda, stA, dInfo, bc),
        rocblas_status_invalid_handle);

    // values
    // N/A

    // sizes (only check batch_count if applicable)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(
            rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA, lda, stA, dInfo, -1),
            rocblas_status_invalid_size);

    // pointers
    EXPECT_ROCBLAS_STATUS(
        rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, (Td) nullptr, lda, stA, dInfo, bc),
        rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(
        rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA, lda, stA, (Id) nullptr, bc),
        rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    // (a zero dimension must succeed even though dA is null)
    EXPECT_ROCBLAS_STATUS(
        rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, 0, n, (Td) nullptr, lda, stA, dInfo, bc),
        rocblas_status_success);
    EXPECT_ROCBLAS_STATUS(
        rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, 0, (Td) nullptr, lda, stA, dInfo, bc),
        rocblas_status_success);

    // quick return with zero batch_count if applicable
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(
            rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA, lda, stA, dInfo, 0),
            rocblas_status_success);
}

/*
 * Entry point of the bad-argument test.
 * Sets m = n = lda = stA = bc = 1, allocates device buffers whose constructor arguments are
 * all 1 (a device_batch_vector for A when BATCHED, a device_strided_batch_vector otherwise),
 * verifies the allocations with memcheck(), and forwards everything to
 * getf2_getrf_npvt_checkBadArgs.
 */
template void testing_getf2_getrf_npvt_bad_arg()
{
    // safe arguments
    rocblas_local_handle handle;
    I m = 1;
    I n = 1;
    I lda = 1;
    rocblas_stride stA = 1;
    I bc = 1;

    if(BATCHED)
    {
        // memory allocations
        device_batch_vector dA(1, 1, 1);
        device_strided_batch_vector dInfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check bad arguments
        getf2_getrf_npvt_checkBadArgs(handle, m, n, dA.data(), lda, stA, dInfo.data(), bc);
    }
    else
    {
        // memory allocations
        device_strided_batch_vector dA(1, 1, 1, 1);
        device_strided_batch_vector dInfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check bad arguments
        getf2_getrf_npvt_checkBadArgs(handle, m, n, dA.data(), lda, stA, dInfo.data(), bc);
    }
}

/*
 * Prepares the input matrices.
 *
 * When CPU is set: fills hA through rocblas_init and then, for each of the bc batch
 * instances, adds 400 to every diagonal entry and subtracts 4 from every off-diagonal entry
 * of the m-by-n matrix (element (i, j) is stored at index i + j * lda). If `singular` is
 * true, the instances b == bc / 4, b == bc / 2 and b == bc - 1 additionally get three
 * columns set to zero.
 *
 * When GPU is set: copies hA into dA.
 *
 * handle, stA and dInfo are not referenced in this function body.
 */
template void getf2_getrf_npvt_initData(const rocblas_handle handle,
                                        const I m,
                                        const I n,
                                        Td& dA,
                                        const I lda,
                                        const rocblas_stride stA,
                                        Id& dInfo,
                                        const I bc,
                                        Th& hA,
                                        const bool singular)
{
    if(CPU)
    {
        rocblas_init(hA, true);

        // scale A to avoid singularities
        // leaving matrix as diagonal dominant so that pivoting is not required
        for(I b = 0; b < bc; ++b)
        {
            for(I i = 0; i < m; i++)
            {
                for(I j = 0; j < n; j++)
                {
                    if(i == j)
                        hA[b][i + j * lda] += 400;
                    else
                        hA[b][i + j * lda] -= 4;
                }
            }

            if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1))
            {
                // When required, add some singularities
                // (always the same elements for debugging purposes).
                // The algorithm must detect the first zero element in the
                // diagonal of those matrices in the batch that are singular

                // `j -= (j / n) * n` reduces j modulo n (j is non-negative here),
                // so the zeroed column index always stays inside [0, n)
                I j = n / 4 + b;
                j -= (j / n) * n;
                for(I i = 0; i < m; i++)
                    hA[b][i + j * lda] = 0;
                j = n / 2 + b;
                j -= (j / n) * n;
                for(I i = 0; i < m; i++)
                    hA[b][i + j * lda] = 0;
                j = n - 1 + b;
                j -= (j / n) * n;
                for(I i = 0; i < m; i++)
                    hA[b][i + j * lda] = 0;
            }
        }
    }

    if(GPU)
    {
        // now copy data to the GPU
        CHECK_HIP_ERROR(dA.transfer_from(hA));
    }
}

/*
 * Runs the factorization on the device and on the host and reports the discrepancy.
 *
 * Steps: initialize the data, call rocsolver_getf2_getrf_npvt, copy dA into hARes and dInfo
 * into hInfoRes, then factorize hA in place on the host with cpu_getrf (when GETRF) or
 * cpu_getf2 (otherwise).
 *
 * On return *max_err holds the largest norm_error('F', ...) between hA[b] and hARes[b] over
 * all batch instances, plus the number of instances whose host and device info values
 * differ. Each info mismatch is also reported through EXPECT_EQ.
 *
 * hIpiv is only handed to the host reference routine; it is not compared against anything.
 */
template void getf2_getrf_npvt_getError(const rocblas_handle handle,
                                        const I m,
                                        const I n,
                                        Td& dA,
                                        const I lda,
                                        const rocblas_stride stA,
                                        Id& dInfo,
                                        const I bc,
                                        Th& hA,
                                        Th& hARes,
                                        Uh& hIpiv,
                                        Uh& hInfo,
                                        Ih& hInfoRes,
                                        double* max_err,
                                        const bool singular)
{
    // input data initialization
    getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dInfo, bc, hA, singular);

    // execute computations
    // GPU lapack
    CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(), lda,
                                                   stA, dInfo.data(), bc));
    CHECK_HIP_ERROR(hARes.transfer_from(dA));
    CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo));

    // CPU lapack
    for(I b = 0; b < bc; ++b)
    {
        GETRF ? cpu_getrf(m, n, hA[b], lda, hIpiv[b], hInfo[b])
              : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hInfo[b]);
    }

    // expecting original matrix to be non-singular
    // error is ||hA - hARes|| / ||hA|| (ideally ||LU - Lres Ures|| / ||LU||)
    // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES.
    // IT MIGHT BE REVISITED IN THE FUTURE)
    // using frobenius norm
    double err;
    *max_err = 0;
    for(I b = 0; b < bc; ++b)
    {
        err = norm_error('F', m, n, lda, hA[b], hARes[b]);
        *max_err = err > *max_err ? err : *max_err;
    }

    // also check info for singularities
    // (every mismatching instance adds 1 to the reported error)
    err = 0;
    for(I b = 0; b < bc; ++b)
    {
        EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b;
        if(hInfo[b][0] != hInfoRes[b][0])
            err++;
    }
    *max_err += err;
}

/*
 * Collects timing data.
 *
 * Unless `perf` is set, the host reference (cpu_getrf / cpu_getf2) is timed over all batch
 * instances with get_time_us_no_sync and the elapsed time is stored in *cpu_time_used.
 * Two cold device calls follow, then `hot_calls` timed device calls. Each timed call is
 * preceded by a fresh getf2_getrf_npvt_initData and bracketed by get_time_us_sync(stream).
 *
 * *gpu_time_used is accumulated with += and finally divided by hot_calls, so the caller
 * must pass it in initialized. With hot_calls == 0 that last division is a floating-point
 * division by zero.
 *
 * When profile > 0 the rocsolver logging layer mode and max levels are set before the
 * timed loop; profile_kernels additionally enables rocblas_layer_mode_ex_log_kernel.
 */
template void getf2_getrf_npvt_getPerfData(const rocblas_handle handle,
                                           const I m,
                                           const I n,
                                           Td& dA,
                                           const I lda,
                                           const rocblas_stride stA,
                                           Id& dInfo,
                                           const I bc,
                                           Th& hA,
                                           Uh& hIpiv,
                                           Uh& hInfo,
                                           double* gpu_time_used,
                                           double* cpu_time_used,
                                           const int hot_calls,
                                           const int profile,
                                           const bool profile_kernels,
                                           const bool perf,
                                           const bool singular)
{
    if(!perf)
    {
        getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dInfo, bc, hA, singular);

        // cpu-lapack performance (only if no perf mode)
        *cpu_time_used = get_time_us_no_sync();
        for(I b = 0; b < bc; ++b)
        {
            GETRF ? cpu_getrf(m, n, hA[b], lda, hIpiv[b], hInfo[b])
                  : cpu_getf2(m, n, hA[b], lda, hIpiv[b], hInfo[b]);
        }
        *cpu_time_used = get_time_us_no_sync() - *cpu_time_used;
    }

    getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dInfo, bc, hA, singular);

    // cold calls
    for(int iter = 0; iter < 2; iter++)
    {
        getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dInfo, bc, hA, singular);

        CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(),
                                                       lda, stA, dInfo.data(), bc));
    }

    // gpu-lapack performance
    hipStream_t stream;
    CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
    double start;

    if(profile > 0)
    {
        if(profile_kernels)
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile
                                         | rocblas_layer_mode_ex_log_kernel);
        else
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile);
        rocsolver_log_set_max_levels(profile);
    }

    for(int iter = 0; iter < hot_calls; iter++)
    {
        // the data is re-initialized outside of the timed region
        getf2_getrf_npvt_initData(handle, m, n, dA, lda, stA, dInfo, bc, hA, singular);

        start = get_time_us_sync(stream);
        // the returned status is not checked inside the timed region
        rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n, dA.data(), lda, stA,
                                   dInfo.data(), bc);
        *gpu_time_used += get_time_us_sync(stream) - start;
    }
    *gpu_time_used /= hot_calls;
}

/*
 * Test/benchmark driver for rocsolver_getf2_getrf_npvt.
 *
 * Flow:
 *   1. read m, n, lda, strideA, strideP, batch_count and iters from `argus`
 *      (n and lda default to m, strideA to lda * n, strideP to min(m, n));
 *   2. if m < 0, n < 0, lda < m or bc < 0, expect rocblas_status_invalid_size and return;
 *   3. if argus.mem_query is set or USE_ROCBLAS_REALLOC_ON_DEMAND is false, run a device
 *      memory size query; with mem_query the size is reported and the function returns,
 *      otherwise the size is applied with rocblas_set_device_memory_size;
 *   4. allocate host/device buffers (batched or strided variants depending on BATCHED);
 *   5. if m == 0, n == 0 or bc == 0, expect rocblas_status_success and return;
 *   6. run getf2_getrf_npvt_getError when unit_check or norm_check is set, and
 *      getf2_getrf_npvt_getPerfData when timing is set;
 *   7. check max_error with ROCSOLVER_TEST_CHECK (unit_check) and print the benchmark
 *      output (timing);
 *   8. call argus.validate_consumed().
 */
template void testing_getf2_getrf_npvt(Arguments& argus)
{
    // get arguments
    rocblas_local_handle handle;
    I m = argus.get("m");
    I n = argus.get("n", m);
    I lda = argus.get("lda", m);
    rocblas_stride stA = argus.get("strideA", lda * n);
    rocblas_stride stP = argus.get("strideP", min(m, n));

    I bc = argus.batch_count;
    int hot_calls = argus.iters;

    // the result buffers get a zero stride/size when no check is requested
    rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0;

    // check non-supported values
    // N/A

    // determine sizes
    size_t size_A = size_t(lda) * n;
    size_t size_P = size_t(min(m, n));
    double max_error = 0, gpu_time_used = 0, cpu_time_used = 0;

    size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0;

    // check invalid sizes
    bool invalid_size = (m < 0 || n < 0 || lda < m || bc < 0);
    if(invalid_size)
    {
        if(BATCHED)
            EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n,
                                                             (T* const*)nullptr, lda, stA,
                                                             (I*)nullptr, bc),
                                  rocblas_status_invalid_size);
        else
            EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n,
                                                             (T*)nullptr, lda, stA, (I*)nullptr,
                                                             bc),
                                  rocblas_status_invalid_size);

        if(argus.timing)
            rocsolver_bench_inform(inform_invalid_size);

        return;
    }

    // memory size query is necessary
    if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND)
    {
        CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle));
        if(BATCHED)
            CHECK_ALLOC_QUERY(rocsolver_getf2_getrf_npvt(
                STRIDED, GETRF, handle, m, n, (T* const*)nullptr, lda, stA, (I*)nullptr, bc));
        else
            CHECK_ALLOC_QUERY(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n,
                                                         (T*)nullptr, lda, stA, (I*)nullptr, bc));

        size_t size;
        CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size));
        if(argus.mem_query)
        {
            rocsolver_bench_inform(inform_mem_query, size);
            return;
        }

        CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size));
    }

    if(BATCHED)
    {
        // memory allocations
        host_batch_vector hA(size_A, 1, bc);
        host_batch_vector hARes(size_ARes, 1, bc);
        host_strided_batch_vector hIpiv(size_P, 1, stP, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_batch_vector dA(size_A, 1, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        // dA is only checked when it has a non-zero size
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check quick return
        if(m == 0 || n == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n,
                                                             dA.data(), lda, stA, dInfo.data(),
                                                             bc),
                                  rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            getf2_getrf_npvt_getError(handle, m, n, dA, lda, stA, dInfo, bc, hA, hARes, hIpiv,
                                      hInfo, hInfoRes, &max_error, argus.singular);

        // collect performance data
        if(argus.timing)
            getf2_getrf_npvt_getPerfData(
                handle, m, n, dA, lda, stA, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used,
                &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf,
                argus.singular);
    }

    else
    {
        // memory allocations
        host_strided_batch_vector hA(size_A, 1, stA, bc);
        host_strided_batch_vector hARes(size_ARes, 1, stARes, bc);
        host_strided_batch_vector hIpiv(size_P, 1, stP, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_strided_batch_vector dA(size_A, 1, stA, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        // dA is only checked when it has a non-zero size
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check quick return
        if(m == 0 || n == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf_npvt(STRIDED, GETRF, handle, m, n,
                                                             dA.data(), lda, stA, dInfo.data(),
                                                             bc),
                                  rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            getf2_getrf_npvt_getError(handle, m, n, dA, lda, stA, dInfo, bc, hA, hARes, hIpiv,
                                      hInfo, hInfoRes, &max_error, argus.singular);

        // collect performance data
        if(argus.timing)
            getf2_getrf_npvt_getPerfData(
                handle, m, n, dA, lda, stA, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used,
                &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf,
                argus.singular);
    }

    // validate results for rocsolver-test
    // using min(m,n) * machine_precision as tolerance
    if(argus.unit_check)
        ROCSOLVER_TEST_CHECK(T, max_error, min(m, n));

    // output results for rocsolver-bench
    if(argus.timing)
    {
        if(!argus.perf)
        {
            rocsolver_bench_header("Arguments:");
            if(BATCHED)
            {
                rocsolver_bench_output("m", "n", "lda", "batch_c");
                rocsolver_bench_output(m, n, lda, bc);
            }
            else if(STRIDED)
            {
                rocsolver_bench_output("m", "n", "lda", "strideA", "batch_c");
                rocsolver_bench_output(m, n, lda, stA, bc);
            }
            else
            {
                rocsolver_bench_output("m", "n", "lda");
                rocsolver_bench_output(m, n, lda);
            }
            rocsolver_bench_header("Results:");
            if(argus.norm_check)
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error");
                rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error);
            }
            else
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us");
                rocsolver_bench_output(cpu_time_used, gpu_time_used);
            }
            rocsolver_bench_endl();
        }
        else
        {
            // perf mode: only the raw numbers are printed, without headers
            if(argus.norm_check)
                rocsolver_bench_output(gpu_time_used, max_error);
            else
                rocsolver_bench_output(gpu_time_used);
        }
    }

    // ensure all arguments were consumed
    argus.validate_consumed();
}

// Declares the instantiations of testing_getf2_getrf_npvt as `extern template`, one per
// argument combination produced by INSTANTIATE. INSTANTIATE and the FOREACH_* / APPLY_STAMP
// macros are defined elsewhere; presumably the matching definitions are the ones emitted by
// testing_getf2_getrf_npvt.cpp -- confirm against that file.
#define EXTERN_TESTING_GETF2_GETRF_NPVT(...) \
    extern template void testing_getf2_getrf_npvt<__VA_ARGS__>(Arguments&);

INSTANTIATE(EXTERN_TESTING_GETF2_GETRF_NPVT,
            FOREACH_MATRIX_DATA_LAYOUT,
            FOREACH_BLOCKED_VARIANT,
            FOREACH_SCALAR_TYPE,
            FOREACH_INT_TYPE,
            APPLY_STAMP)
rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getrf_large.hpp000066400000000000000000000506031503202240500251100ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/
#pragma once

#include "common/misc/client_util.hpp"
#include "common/misc/clientcommon.hpp"
#include "common/misc/lapack_host_reference.hpp"
#include "common/misc/norm.hpp"
#include "common/misc/rocsolver.hpp"
#include "common/misc/rocsolver_arguments.hpp"
#include "common/misc/rocsolver_test.hpp"

/*
 * ===========================================================================
 *    testing_getrf_large tests the correctness of getrf, getrs, and gemm. We
 *    use an implicit test that solves Ax = b for x (using getrf and getrs),
 *    then computes Ax (using gemm) and compares it with b. For the sizes
 *    tested, this is much faster than calling getrf on the CPU.
 * ===========================================================================
 */

// NOTE(review): in this copy of the file everything that was written between angle
// brackets appears to have been lost (template parameter lists, the condition of the
// enable_if-style constraints below, and explicit template arguments such as
// `argus.get("n")` or `host_batch_vector hA`). The names STRIDED, GETRF, BATCHED, CPU, GPU,
// T, Td, Ud, Th and Uh used below are presumably template parameters -- restore them from
// the upstream sources before compiling.

/*
 * Prepares A and B (first overload: scaling uses the plain values 400 and 4).
 * NOTE(review): the constraint that selects this overload was lost; since the second
 * overload builds T(400, 400), this one is presumably the real-valued case -- confirm.
 *
 * When CPU is set, hA and hB are filled through rocblas_init and then, per batch instance:
 *   - 400 is added to the diagonal and 4 subtracted from the off-diagonal entries of the
 *     n-by-n matrix (element (i, j) is stored at index i + j * lda);
 *   - row i is swapped with row n - 1 - i for i < n / 2, i.e. the row order is reversed;
 *   - if `singular` is true and b is bc / 4, bc / 2 or bc - 1, three columns are zeroed.
 *
 * When GPU is set, hA is copied into dA and hB is copied into both dB and dX.
 *
 * handle, nrhs, stA, ldb, stB, dIpiv, stP, dInfo, hIpiv and hInfo are not referenced in
 * this function body.
 */
template , int> = 0>
void getrf_large_initData(const rocblas_handle handle,
                          const rocblas_int n,
                          const rocblas_int nrhs,
                          Td& dA,
                          const rocblas_int lda,
                          const rocblas_stride stA,
                          Td& dB,
                          const rocblas_int ldb,
                          const rocblas_stride stB,
                          Td& dX,
                          Ud& dIpiv,
                          const rocblas_stride stP,
                          Ud& dInfo,
                          const rocblas_int bc,
                          Th& hA,
                          Th& hB,
                          Uh& hIpiv,
                          Uh& hInfo,
                          const bool singular)
{
    if(CPU)
    {
        T tmp;
        rocblas_init(hA, true);
        rocblas_init(hB, false);

        for(rocblas_int b = 0; b < bc; ++b)
        {
            // scale A to avoid singularities
            for(rocblas_int i = 0; i < n; i++)
            {
                for(rocblas_int j = 0; j < n; j++)
                {
                    if(i == j)
                        hA[b][i + j * lda] += 400;
                    else
                        hA[b][i + j * lda] -= 4;
                }
            }

            // shuffle rows to test pivoting
            // always the same permutation for debugging purposes
            for(rocblas_int i = 0; i < n / 2; i++)
            {
                for(rocblas_int j = 0; j < n; j++)
                {
                    tmp = hA[b][i + j * lda];
                    hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda];
                    hA[b][n - 1 - i + j * lda] = tmp;
                }
            }

            if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1))
            {
                // When required, add some singularities
                // (always the same elements for debugging purposes).
                // The algorithm must detect the first zero pivot in those
                // matrices in the batch that are singular

                // `j -= (j / n) * n` reduces j modulo n (j is non-negative here)
                rocblas_int j = n / 4 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n / 2 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n - 1 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
            }
        }
    }

    if(GPU)
    {
        // now copy data to the GPU
        // (dX starts out as a copy of the right-hand side B)
        CHECK_HIP_ERROR(dA.transfer_from(hA));
        CHECK_HIP_ERROR(dB.transfer_from(hB));
        CHECK_HIP_ERROR(dX.transfer_from(hB));
    }
}

/*
 * Prepares A and B (second overload: scaling uses T(400, 400) and T(4, 4)).
 * NOTE(review): the constraint that selects this overload was lost; the two-argument
 * construction of T suggests this is the complex-valued case -- confirm.
 *
 * Apart from the scaling constants the body is the same as the first overload: fill hA/hB,
 * scale, reverse the row order, optionally zero three columns of selected batch instances,
 * and copy hA -> dA, hB -> dB, hB -> dX when GPU is set.
 */
template , int> = 0>
void getrf_large_initData(const rocblas_handle handle,
                          const rocblas_int n,
                          const rocblas_int nrhs,
                          Td& dA,
                          const rocblas_int lda,
                          const rocblas_stride stA,
                          Td& dB,
                          const rocblas_int ldb,
                          const rocblas_stride stB,
                          Td& dX,
                          Ud& dIpiv,
                          const rocblas_stride stP,
                          Ud& dInfo,
                          const rocblas_int bc,
                          Th& hA,
                          Th& hB,
                          Uh& hIpiv,
                          Uh& hInfo,
                          const bool singular)
{
    if(CPU)
    {
        T tmp;
        rocblas_init(hA, true);
        rocblas_init(hB, false);

        for(rocblas_int b = 0; b < bc; ++b)
        {
            // scale A to avoid singularities
            for(rocblas_int i = 0; i < n; i++)
            {
                for(rocblas_int j = 0; j < n; j++)
                {
                    if(i == j)
                        hA[b][i + j * lda] += T(400, 400);
                    else
                        hA[b][i + j * lda] -= T(4, 4);
                }
            }

            // shuffle rows to test pivoting
            // always the same permutation for debugging purposes
            for(rocblas_int i = 0; i < n / 2; i++)
            {
                for(rocblas_int j = 0; j < n; j++)
                {
                    tmp = hA[b][i + j * lda];
                    hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda];
                    hA[b][n - 1 - i + j * lda] = tmp;
                }
            }

            if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1))
            {
                // When required, add some singularities
                // (always the same elements for debugging purposes).
                // The algorithm must detect the first zero pivot in those
                // matrices in the batch that are singular

                // `j -= (j / n) * n` reduces j modulo n (j is non-negative here)
                rocblas_int j = n / 4 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n / 2 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
                j = n - 1 + b;
                j -= (j / n) * n;
                for(rocblas_int i = 0; i < n; i++)
                    hA[b][i + j * lda] = 0;
            }
        }
    }

    if(GPU)
    {
        // now copy data to the GPU
        // (dX starts out as a copy of the right-hand side B)
        CHECK_HIP_ERROR(dA.transfer_from(hA));
        CHECK_HIP_ERROR(dB.transfer_from(hB));
        CHECK_HIP_ERROR(dX.transfer_from(hB));
    }
}

/*
 * Implicit correctness check: factorize A on the device, solve A x = b, then recompute
 * A x with gemm and compare it with the original b.
 *
 * Steps: initialize the data, call rocsolver_getf2_getrf on the n-by-n matrix in dA, call
 * rocsolver_getrs with dX as the right-hand side / solution, reload the original A from hA
 * into dA (the factorization overwrote it), call rocblas_gemm with alpha = 1 and beta = 0
 * writing into dB, and copy dB into hBRes.
 *
 * On return *max_err holds the largest norm_error('F', n, nrhs, ldb, hB[b], hBRes[b]) over
 * all batch instances.
 *
 * NOTE(review): dInfo is never read back and hInfoRes is not referenced in this body, so
 * the info value produced by the factorization is not verified here, including when
 * `singular` is requested.
 */
template void getrf_large_getError(const rocblas_handle handle,
                                   const rocblas_int n,
                                   const rocblas_int nrhs,
                                   Td& dA,
                                   const rocblas_int lda,
                                   const rocblas_stride stA,
                                   Td& dB,
                                   const rocblas_int ldb,
                                   const rocblas_stride stB,
                                   Td& dX,
                                   Ud& dIpiv,
                                   const rocblas_stride stP,
                                   Ud& dInfo,
                                   const rocblas_int bc,
                                   Th& hA,
                                   Th& hB,
                                   Th& hBRes,
                                   Uh& hIpiv,
                                   Uh& hInfo,
                                   Uh& hInfoRes,
                                   double* max_err,
                                   const bool singular)
{
    // Input data initialization for Matrix A
    getrf_large_initData(handle, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, dIpiv, stP, dInfo, bc,
                         hA, hB, hIpiv, hInfo, singular);

    // execute computations
    // GPU lapack
    CHECK_ROCBLAS_ERROR(rocsolver_getf2_getrf(STRIDED, GETRF, handle, n, n, dA.data(), lda, stA,
                                              dIpiv.data(), stP, dInfo.data(), bc));

    // Solve Ax = b for x
    CHECK_ROCBLAS_ERROR(rocsolver_getrs(STRIDED, handle, rocblas_operation_none, n, nrhs, dA, lda,
                                        stA, dIpiv, stP, dX, ldb, stB, bc));

    // Resetting the value of dA.
    CHECK_HIP_ERROR(dA.transfer_from(hA));

    // Compute Ax
    T alpha = T(1), beta = T(0);
    CHECK_ROCBLAS_ERROR(rocblas_gemm(STRIDED, handle, rocblas_operation_none,
                                     rocblas_operation_none, n, nrhs, n, &alpha, dA, lda, stA, dX,
                                     ldb, stB, &beta, dB, ldb, stB, bc));
    CHECK_HIP_ERROR(hBRes.transfer_from(dB));

    double err;
    *max_err = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        // Pass the matrices here
        err = norm_error('F', n, nrhs, ldb, hB[b], hBRes[b]);
        *max_err = err > *max_err ? err : *max_err;
    }
}

// Function for performance data.
// No timing is performed for this test: both output times are set to NaN and every other
// parameter is ignored.
template void getrf_large_getPerfData(const rocblas_handle handle,
                                      const rocblas_int m,
                                      const rocblas_int n,
                                      Td& dA,
                                      const rocblas_int lda,
                                      const rocblas_stride stA,
                                      Ud& dIpiv,
                                      const rocblas_stride stP,
                                      Ud& dInfo,
                                      const rocblas_int bc,
                                      Th& hA,
                                      Uh& hIpiv,
                                      Uh& hInfo,
                                      double* gpu_time_used,
                                      double* cpu_time_used,
                                      const rocblas_int hot_calls,
                                      const int profile,
                                      const bool profile_kernels,
                                      const bool perf,
                                      const bool singular)
{
    *cpu_time_used = nan(""); // no timing on cpu-lapack execution
    *gpu_time_used = nan(""); // no timing on gpu-lapack execution
}

/*
 * Test/benchmark driver for the large getrf test.
 *
 * Flow:
 *   1. read n, nrhs, lda, ldb, strideA, strideB, strideP, batch_count and iters from
 *      `argus` (nrhs, lda and ldb default to n, strideA to lda * n, strideB to ldb * nrhs,
 *      strideP to n);
 *   2. if n < 0, nrhs < 0, lda < n, ldb < n or bc < 0, return (the status checks for this
 *      case are commented out below);
 *   3. if argus.mem_query is set or USE_ROCBLAS_REALLOC_ON_DEMAND is false, run a device
 *      memory size query for rocsolver_getf2_getrf; with mem_query the size is reported and
 *      the function returns, otherwise it is applied with rocblas_set_device_memory_size;
 *   4. allocate host/device buffers (batched or strided variants depending on BATCHED);
 *   5. run getrf_large_getError when unit_check or norm_check is set, and
 *      getrf_large_getPerfData when timing is set;
 *   6. check max_error with ROCSOLVER_TEST_CHECK (unit_check) and print the benchmark
 *      output (timing);
 *   7. call argus.validate_consumed().
 *
 * Unlike the other drivers in this family there is no quick-return branch here.
 */
template void testing_getrf_large(Arguments& argus)
{
    // get arguments
    rocblas_local_handle handle;
    rocblas_int n = argus.get("n");
    rocblas_int nrhs = argus.get("nrhs", n);
    rocblas_int lda = argus.get("lda", n);
    rocblas_int ldb = argus.get("ldb", n);
    rocblas_stride stA = argus.get("strideA", lda * n);
    rocblas_stride stB = argus.get("strideB", ldb * nrhs);
    rocblas_stride stP = argus.get("strideP", n);

    rocblas_int bc = argus.batch_count;
    rocblas_int hot_calls = argus.iters;

    // NOTE(review): stARes and stPRes are not used anywhere below; only stBRes is
    rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0;
    rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0;
    rocblas_stride stPRes = (argus.unit_check || argus.norm_check) ? stP : 0;

    // determine sizes using the leading dimensions, which are typically greater than the rows
    size_t size_A = size_t(lda) * n;
    size_t size_B = size_t(ldb) * nrhs;
    size_t size_P = size_t(n);
    double max_error = 0, gpu_time_used = 0, cpu_time_used = 0;

    // NOTE(review): size_ARes and size_PRes are not used anywhere below; only size_BRes is
    size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0;
    size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0;
    size_t size_PRes = (argus.unit_check || argus.norm_check) ? size_P : 0;

    // check invalid sizes
    bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0);
    if(invalid_size)
    {
        // if(BATCHED)
        //     EXPECT_ROCBLAS_STATUS(
        //         rocsolver_getf2_getrf(STRIDED, GETRF, handle, n, n, (T* const*)nullptr, lda, stA,
        //                               (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc),
        //         rocblas_status_invalid_size);
        // else
        //     EXPECT_ROCBLAS_STATUS(rocsolver_getf2_getrf(STRIDED, GETRF, handle, n, n, (T*)nullptr,
        //                                                 lda, stA, (rocblas_int*)nullptr, stP,
        //                                                 (rocblas_int*)nullptr, bc),
        //                           rocblas_status_invalid_size);

        if(argus.timing)
            rocsolver_bench_inform(inform_invalid_size);

        return;
    }

    // memory size query is necessary
    if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND)
    {
        CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle));
        if(BATCHED)
            CHECK_ALLOC_QUERY(rocsolver_getf2_getrf(STRIDED, GETRF, handle, n, n,
                                                    (T* const*)nullptr, lda, stA,
                                                    (rocblas_int*)nullptr, stP,
                                                    (rocblas_int*)nullptr, bc));
        else
            CHECK_ALLOC_QUERY(rocsolver_getf2_getrf(STRIDED, GETRF, handle, n, n, (T*)nullptr,
                                                    lda, stA, (rocblas_int*)nullptr, stP,
                                                    (rocblas_int*)nullptr, bc));

        size_t size;
        CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size));
        if(argus.mem_query)
        {
            rocsolver_bench_inform(inform_mem_query, size);
            return;
        }

        CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size));
    }

    if(BATCHED)
    {
        // memory allocations
        host_batch_vector hA(size_A, 1, bc);
        host_batch_vector hB(size_B, 1, bc);
        host_batch_vector hBRes(size_BRes, 1, bc);
        host_strided_batch_vector hIpiv(size_P, 1, stP, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_batch_vector dA(size_A, 1, bc);
        device_batch_vector dB(size_B, 1, bc);
        device_batch_vector dX(size_B, 1, bc);
        device_strided_batch_vector dIpiv(size_P, 1, stP, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        // buffers are only checked when they have a non-zero size
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        if(size_B)
        {
            CHECK_HIP_ERROR(dB.memcheck());
            CHECK_HIP_ERROR(dX.memcheck());
        }
        if(size_P)
            CHECK_HIP_ERROR(dIpiv.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check computations
        if(argus.unit_check || argus.norm_check)
            // Modify the parameters passed.
            getrf_large_getError(handle, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, dIpiv, stP,
                                 dInfo, bc, hA, hB, hBRes, hIpiv, hInfo, hInfoRes, &max_error,
                                 argus.singular);

        // collect performance data
        if(argus.timing)
            getrf_large_getPerfData(
                handle, n, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo,
                &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels,
                argus.perf, argus.singular);
    }

    else
    {
        // memory allocations
        host_strided_batch_vector hA(size_A, 1, stA, bc);
        host_strided_batch_vector hB(size_B, 1, stB, bc);
        host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc);
        host_strided_batch_vector hIpiv(size_P, 1, stP, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_strided_batch_vector dA(size_A, 1, stA, bc);
        device_strided_batch_vector dB(size_B, 1, stB, bc);
        device_strided_batch_vector dX(size_B, 1, stB, bc);
        device_strided_batch_vector dIpiv(size_P, 1, stP, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        // buffers are only checked when they have a non-zero size
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        if(size_B)
        {
            CHECK_HIP_ERROR(dB.memcheck());
            CHECK_HIP_ERROR(dX.memcheck());
        }
        if(size_P)
            CHECK_HIP_ERROR(dIpiv.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check computations
        if(argus.unit_check || argus.norm_check)
            getrf_large_getError(handle, n, nrhs, dA, lda, stA, dB, ldb, stB, dX, dIpiv, stP,
                                 dInfo, bc, hA, hB, hBRes, hIpiv, hInfo, hInfoRes, &max_error,
                                 argus.singular);

        // The perf function must return NAN
        // collect performance data
        if(argus.timing)
            getrf_large_getPerfData(
                handle, n, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo,
                &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels,
                argus.perf, argus.singular);
    }

    // validate results for rocsolver-test
    // using min(m,n) * machine_precision as tolerance
    // (the matrix is square here, so min(n, n) is simply n)
    if(argus.unit_check)
        ROCSOLVER_TEST_CHECK(T, max_error, min(n, n));

    // output results for rocsolver-bench
    if(argus.timing)
    {
        if(!argus.perf)
        {
            rocsolver_bench_header("Arguments:");
            if(BATCHED)
            {
                rocsolver_bench_output("n", "nrhs", "lda", "ldb", "strideP", "batch_c");
                rocsolver_bench_output(n, nrhs, lda, ldb, stP, bc);
            }
            else if(STRIDED)
            {
                rocsolver_bench_output("n", "nrhs", "lda", "strideA", "ldb", "strideB", "strideP",
                                       "batch_c");
                rocsolver_bench_output(n, nrhs, lda, stA, ldb, stB, stP, bc);
            }
            else
            {
                rocsolver_bench_output("n", "nrhs", "lda", "ldb");
                rocsolver_bench_output(n, nrhs, lda, ldb);
            }
            rocsolver_bench_header("Results:");
            if(argus.norm_check)
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error");
                rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error);
            }
            else
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us");
                rocsolver_bench_output(cpu_time_used, gpu_time_used);
            }
            rocsolver_bench_endl();
        }
        else
        {
            // perf mode: only the raw numbers are printed, without headers
            if(argus.norm_check)
                rocsolver_bench_output(gpu_time_used, max_error);
            else
                rocsolver_bench_output(gpu_time_used);
        }
    }

    // ensure all arguments were consumed
    argus.validate_consumed();
}
rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri.cpp000066400000000000000000000033001503202240500237240ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_getri.hpp" #define TESTING_GETRI(...) template void testing_getri<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri.hpp000066400000000000000000000475441503202240500237530ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void getri_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, nullptr, n, dA, lda, stA, dIpiv, stP, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, n, (T) nullptr, lda, stA, dIpiv, stP, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, n, dA, lda, stA, (U) nullptr, stP, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( 
rocsolver_getri(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, 0, (T) nullptr, lda, stA, (U) nullptr, stP, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_getri(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } } template void getri_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Ud& dIpiv, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = hA[b][i + j * lda] / 10.0 + 10; //+= 400; else hA[b][i + j * lda] = (hA[b][i + j * lda] - 4) / 10.0; // -= 4; } } // shuffle rows to test 
pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda]; hA[b][n - 1 - i + j * lda] = tmp; } } // do the LU decomposition of matrix A w/ the reference LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void getri_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| 
// (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } template void getri_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), 
stP, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // 
memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_getError(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_getPerfData(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) 
rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_getError(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_getPerfData(handle, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "strideP", "batch_c"); rocsolver_bench_output(n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(n, lda, stA, stP, bc); } else { rocsolver_bench_output("n", "lda"); rocsolver_bench_output(n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI(...) 
extern template void testing_getri<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri_npvt.cpp000066400000000000000000000033241503202240500250010ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_getri_npvt.hpp" #define TESTING_GETRI_NPVT(...) 
template void testing_getri_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri_npvt.hpp000066400000000000000000000442141503202240500250110ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void getri_npvt_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, nullptr, n, dA, lda, stA, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, dA, lda, stA, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, 0, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, dA, lda, stA, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_npvt_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_checkBadArgs(handle, n, dA.data(), lda, stA, dInfo.data(), bc); } else { // memory allocations 
device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_checkBadArgs(handle, n, dA.data(), lda, stA, dInfo.data(), bc); } } template void getri_npvt_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // do the LU decomposition of matrix A w/ the reference LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void getri_npvt_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc)); 
CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } template void getri_npvt_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) 
rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_npvt_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri_npvt(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri_npvt(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri_npvt(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_npvt_getError(handle, n, dA, lda, stA, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect 
performance data if(argus.timing) getri_npvt_getPerfData(handle, n, dA, lda, stA, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_getri_npvt(STRIDED, handle, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_npvt_getError(handle, n, dA, lda, stA, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_npvt_getPerfData(handle, n, dA, lda, stA, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "batch_c"); rocsolver_bench_output(n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "batch_c"); rocsolver_bench_output(n, lda, stA, bc); } else { rocsolver_bench_output("n", "lda"); rocsolver_bench_output(n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI_NPVT(...) \ extern template void testing_getri_npvt<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI_NPVT, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri_npvt_outofplace.cpp000066400000000000000000000034061503202240500272230ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_getri_npvt_outofplace.hpp" #define TESTING_GETRI_NPVT_OUTOFPLACE(...) \ template void testing_getri_npvt_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI_NPVT_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri_npvt_outofplace.hpp000066400000000000000000000540031503202240500272270ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void getri_npvt_outofplace_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dC, const rocblas_int ldc, const rocblas_stride stC, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_getri_npvt_outofplace(STRIDED, nullptr, n, dA, lda, stA, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, dC, ldc, stC, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T) nullptr, lda, stA, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, dC, ldc, stC, (U) nullptr, bc), rocblas_status_invalid_pointer); // 
quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, 0, (T) nullptr, lda, stA, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA, lda, stA, dC, ldc, stC, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_npvt_outofplace_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; rocblas_stride stA = 1; rocblas_stride stC = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dC(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_npvt_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc); } } template void getri_npvt_outofplace_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities // leaving matrix as diagonal dominant so that pivoting is not required for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // do the LU decomposition of matrix A w/ the reference 
LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void getri_npvt_outofplace_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dC)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b], ldc); *max_err = err > *max_err ? 
err : *max_err; } } } template void getri_npvt_outofplace_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_npvt_outofplace_initData(handle, n, dA, lda, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri_npvt_outofplace(Arguments& 
argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldc = argus.get("ldc", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stC = argus.get("strideC", ldc * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stC : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_C = size_t(ldc) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldc < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace( STRIDED, handle, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dC(size_C, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_npvt_outofplace_getError(handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_npvt_outofplace_getPerfData( handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_npvt_outofplace(STRIDED, 
handle, n, dA.data(), lda, stA, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_npvt_outofplace_getError(handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_npvt_outofplace_getPerfData( handle, n, dA, lda, stA, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "ldc", "batch_c"); rocsolver_bench_output(n, lda, ldc, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "ldc", "strideC", "batch_c"); rocsolver_bench_output(n, lda, stA, ldc, stC, bc); } else { rocsolver_bench_output("n", "lda", "ldc"); rocsolver_bench_output(n, lda, ldc); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI_NPVT_OUTOFPLACE(...) 
\ extern template void testing_getri_npvt_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI_NPVT_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri_outofplace.cpp000066400000000000000000000033621503202240500261550ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_getri_outofplace.hpp" #define TESTING_GETRI_OUTOFPLACE(...) 
\ template void testing_getri_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRI_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getri_outofplace.hpp000066400000000000000000000572171503202240500261720ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void getri_outofplace_checkBadArgs(const rocblas_handle handle, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, T dC, const rocblas_int ldc, const rocblas_stride stC, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, nullptr, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_handle); // values // N/A // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, (T) nullptr, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, (U) nullptr, stP, dC, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, 0, (T) nullptr, lda, stA, (U) nullptr, stP, (T) nullptr, ldc, stC, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, (U) nullptr, 0), rocblas_status_success); } template void testing_getri_outofplace_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldc = 1; rocblas_stride stA = 1; rocblas_stride stC = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dC(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dC(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dC.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments getri_outofplace_checkBadArgs(handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc); } } template void getri_outofplace_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, Ud& dIpiv, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; 
j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda]; hA[b][n - 1 - i + j * lda] = tmp; } } // do the LU decomposition of matrix A w/ the reference LAPACK routine cpu_getrf(n, n, hA[b], lda, hIpiv[b], hInfo[b]); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void getri_outofplace_getError(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); // input data initialization getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dC)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT 
ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b], ldc); *max_err = err > *max_err ? err : *max_err; } } } template void getri_outofplace_getPerfData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Td& dC, const rocblas_int ldc, const rocblas_stride stC, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { rocblas_int sizeW = n; std::vector hW(sizeW); if(!perf) { getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_getri(n, hA[b], lda, hIpiv[b], hW.data(), sizeW, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { getri_outofplace_initData(handle, n, dA, lda, dIpiv, bc, hA, 
hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_getri_outofplace(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldc = argus.get("ldc", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stC = argus.get("strideC", ldc * n); rocblas_stride stP = argus.get("strideP", n); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stC : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_C = size_t(ldc) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_C : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldc < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getri_outofplace( STRIDED, handle, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T* const*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_getri_outofplace(STRIDED, handle, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (T*)nullptr, ldc, stC, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dC(size_C, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // 
check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_outofplace_getError(handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) getri_outofplace_getPerfData( handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dC(size_C, 1, stC, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_C) CHECK_HIP_ERROR(dC.memcheck()); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getri_outofplace(STRIDED, handle, n, dA.data(), lda, stA, dIpiv.data(), stP, dC.data(), ldc, stC, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getri_outofplace_getError(handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hARes, hIpiv, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) 
getri_outofplace_getPerfData( handle, n, dA, lda, stA, dIpiv, stP, dC, ldc, stC, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("n", "lda", "strideP", "ldc", "batch_c"); rocsolver_bench_output(n, lda, stP, ldc, bc); } else if(STRIDED) { rocsolver_bench_output("n", "lda", "strideA", "strideP", "ldc", "strideC", "batch_c"); rocsolver_bench_output(n, lda, stA, stP, ldc, stC, bc); } else { rocsolver_bench_output("n", "lda", "ldc"); rocsolver_bench_output(n, lda, ldc); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_GETRI_OUTOFPLACE(...) \ extern template void testing_getri_outofplace<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRI_OUTOFPLACE, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getrs.cpp000066400000000000000000000033221503202240500237420ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_getrs.hpp" #define TESTING_GETRS(...) template void testing_getrs<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_GETRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_getrs.hpp000066400000000000000000000522321503202240500237530ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void getrs_checkBadArgs(const rocblas_handle handle, const rocblas_operation trans, const I n, const I nrhs, Td dA, const I lda, const rocblas_stride stA, Id dIpiv, const rocblas_stride stP, Td dB, const I ldb, const rocblas_stride stB, const I bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, nullptr, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, rocblas_operation(0), n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (Td) nullptr, lda, stA, dIpiv, stP, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, (Id) nullptr, stP, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, (Td) nullptr, ldb, stB, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, 0, nrhs, (Td) nullptr, lda, stA, (Id) nullptr, stP, (Td) nullptr, ldb, stB, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, 0, dA, lda, stA, dIpiv, stP, (Td) nullptr, ldb, stB, bc), 
rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, 0), rocblas_status_success); } template void testing_getrs_bad_arg() { // safe arguments rocblas_local_handle handle; I n = 1; I nrhs = 1; I lda = 1; I ldb = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_stride stB = 1; I bc = 1; rocblas_operation trans = rocblas_operation_none; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments getrs_checkBadArgs(handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); // check bad arguments getrs_checkBadArgs(handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc); } } template void getrs_initData(const rocblas_handle handle, const rocblas_operation trans, const I n, const I nrhs, Td& dA, const I lda, const rocblas_stride stA, Id& dIpiv, const rocblas_stride stP, Td& dB, const I ldb, const rocblas_stride stB, const I bc, Th& hA, Ih& hIpiv, Uh& hIpiv_cpu, Th& hB) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); // scale A to avoid singularities for(I b = 0; b < bc; ++b) { for(I i = 0; i < n; i++) { for(I j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } // do the LU decomposition of matrix A w/ the reference LAPACK routine for(I b = 0; b < bc; ++b) { int info; cpu_getrf(n, n, hA[b], lda, hIpiv_cpu[b], &info); for(I i = 0; i < 
n; i++) hIpiv[b][i] = hIpiv_cpu[b][i]; } } if(GPU) { // now copy pivoting indices and matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); CHECK_HIP_ERROR(dIpiv.transfer_from(hIpiv)); } } template void getrs_getError(const rocblas_handle handle, const rocblas_operation trans, const I n, const I nrhs, Td& dA, const I lda, const rocblas_stride stA, Id& dIpiv, const rocblas_stride stP, Td& dB, const I ldb, const rocblas_stride stB, const I bc, Th& hA, Ih& hIpiv, Uh& hIpiv_cpu, Th& hB, Th& hBRes, double* max_err) { // input data initialization getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); // CPU lapack for(I b = 0; b < bc; ++b) { cpu_getrs(trans, n, nrhs, hA[b], lda, hIpiv_cpu[b], hB[b], ldb); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(I b = 0; b < bc; ++b) { err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } template void getrs_getPerfData(const rocblas_handle handle, const rocblas_operation trans, const I n, const I nrhs, Td& dA, const I lda, const rocblas_stride stA, Id& dIpiv, const rocblas_stride stP, Td& dB, const I ldb, const rocblas_stride stB, const I bc, Th& hA, Ih& hIpiv, Uh& hIpiv_cpu, Th& hB, double* gpu_time_used, double* cpu_time_used, const int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(I b = 0; b < bc; ++b) { cpu_getrs(trans, n, nrhs, hA[b], lda, hIpiv_cpu[b], hB[b], ldb); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB); // cold calls for(int iter = 0; iter < 2; iter++) { getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB); CHECK_ROCBLAS_ERROR(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(int iter = 0; iter < hot_calls; iter++) { getrs_initData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB); start = get_time_us_sync(stream); rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void 
testing_getrs(Arguments& argus) { // get arguments rocblas_local_handle handle; char transC = argus.get("trans"); I n = argus.get("n"); I nrhs = argus.get("nrhs", n); I lda = argus.get("lda", n); I ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_operation trans = char2rocblas_operation(transC); I bc = argus.batch_count; int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values // N/A // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T* const*)nullptr, lda, stA, (I*)nullptr, stP, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T*)nullptr, lda, stA, (I*)nullptr, stP, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T* const*)nullptr, lda, stA, (I*)nullptr, stP, (T* const*)nullptr, ldb, stB, bc)); else CHECK_ALLOC_QUERY(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, (T*)nullptr, lda, stA, (I*)nullptr, stP, (T*)nullptr, ldb, stB, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { 
rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpiv_cpu(size_P, 1, stP, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getrs_getError(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB, hBRes, &max_error); // collect performance data if(argus.timing) getrs_getPerfData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpiv_cpu(size_P, 1, stP, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick 
return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_getrs(STRIDED, handle, trans, n, nrhs, dA.data(), lda, stA, dIpiv.data(), stP, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) getrs_getError(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB, hBRes, &max_error); // collect performance data if(argus.timing) getrs_getPerfData(handle, trans, n, nrhs, dA, lda, stA, dIpiv, stP, dB, ldb, stB, bc, hA, hIpiv, hIpiv_cpu, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("trans", "n", "nrhs", "lda", "ldb", "strideP", "batch_c"); rocsolver_bench_output(transC, n, nrhs, lda, ldb, stP, bc); } else if(STRIDED) { rocsolver_bench_output("trans", "n", "nrhs", "lda", "ldb", "strideA", "strideP", "strideB", "batch_c"); rocsolver_bench_output(transC, n, nrhs, lda, ldb, stA, stP, stB, bc); } else { rocsolver_bench_output("trans", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(transC, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define 
EXTERN_TESTING_GETRS(...) extern template void testing_getrs<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_GETRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_posv.cpp000066400000000000000000000032741503202240500236130ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_posv.hpp" #define TESTING_POSV(...) 
template void testing_posv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POSV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_posv.hpp000066400000000000000000000531421503202240500236170ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void posv_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, nullptr, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, rocblas_fill_full, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T) nullptr, lda, stA, dB, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, 0, nrhs, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, uplo, n, 0, dA, lda, stA, (T) nullptr, ldb, stB, dInfo, bc), rocblas_status_success); // quick return with zero batch_count 
if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, 0), rocblas_status_success); } template void testing_posv_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int nrhs = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_int bc = 1; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments posv_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments posv_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); } } template void posv_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, const bool singular) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); for(rocblas_int b = 0; b < bc; ++b) { // scale to ensure positive definiteness for(rocblas_int i = 0; i < n; i++) hA[b][i + i * lda] = hA[b][i + i * lda] * sconj(hA[b][i + i * lda]) * 400; if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those 
matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } if(GPU) { // now copy matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void posv_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Th& hBRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_posv(uplo, n, nrhs, hA[b], lda, hB[b], ldb, hInfo[b]); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? 
err : *max_err; } // also check info for non positive definite cases err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; } template void posv_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, const rocblas_int nrhs, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_posv(uplo, n, nrhs, hA[b], lda, hB[b], ldb, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); // cold calls for(int iter = 0; iter < 2; iter++) { posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); CHECK_ROCBLAS_ERROR(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { posv_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, singular); start = get_time_us_sync(stream); rocsolver_posv(STRIDED, handle, uplo, n, nrhs, 
dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_posv(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int nrhs = argus.get("nrhs", n); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? 
size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); 
if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) posv_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) posv_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_posv(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) posv_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hBRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) posv_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, dInfo, bc, hA, hB, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench 
if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "strideA", "strideB", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, stA, stB, bc); } else { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_POSV(...) extern template void testing_posv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_POSV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_potf2_potrf.cpp000066400000000000000000000034771503202240500250750ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_potf2_potrf.hpp" #define TESTING_POTF2_POTRF(...) template void testing_potf2_potrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POTF2_POTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_potf2_potrf.hpp000066400000000000000000000510721503202240500250740ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void potf2_potrf_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const I n, T dA, const I lda, const rocblas_stride stA, U dinfo, const I bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, nullptr, uplo, n, dA, lda, stA, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, rocblas_fill_full, n, dA, lda, stA, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, dinfo, -1), 
rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, 0, (T) nullptr, lda, stA, dinfo, bc), rocblas_status_success); if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, (U) nullptr, 0), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA, lda, stA, dinfo, 0), rocblas_status_success); } template void testing_potf2_potrf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; I n = 1; I lda = 1; rocblas_stride stA = 1; I bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments potf2_potrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments potf2_potrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc); } } template void potf2_potrf_initData(const rocblas_handle handle, const rocblas_fill uplo, const I n, Td& dA, const I lda, const rocblas_stride stA, Ud& dInfo, const I bc, Th& hA, Uh& hInfo, const bool singular) { if(CPU) { rocblas_init(hA, true); for(I b = 0; b < bc; ++b) { // scale to ensure positive definiteness for(I i = 0; i < n; i++) hA[b][i + i * lda] = hA[b][i + i * lda] 
* sconj(hA[b][i + i * lda]) * 400; if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite I i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void potf2_potrf_getError(const rocblas_handle handle, const rocblas_fill uplo, const I n, Td& dA, const I lda, const rocblas_stride stA, Id& dInfo, const I bc, Th& hA, Th& hARes, Uh& hInfo, Ih& hInfoRes, double* max_err, const bool singular, size_t& hashA, size_t& hashARes) { // input data initialization potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // hash input hashA = deterministic_hash(hA, bc); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // hash output hashARes = deterministic_hash(hARes, bc); // CPU lapack for(I b = 0; b < bc; ++b) { POTRF ? cpu_potrf(uplo, n, hA[b], lda, hInfo[b]) : cpu_potf2(uplo, n, hA[b], lda, hInfo[b]); } // error is ||hA - hARes|| / ||hA|| (ideally ||LL' - Lres Lres'|| / ||LL'||) // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; I nn; *max_err = 0; for(I b = 0; b < bc; ++b) { nn = hInfoRes[b][0] == 0 ? n : hInfoRes[b][0]; // (TODO: For now, the algorithm is modifying the whole input matrix even when // it is not positive definite. So we only check the principal nn-by-nn submatrix. 
// Once this is corrected, nn could be always equal to n.) *max_err = (uplo == rocblas_fill_lower) ? norm_error_lowerTr('F', nn, nn, lda, hA[b], hARes[b]) : norm_error_upperTr('F', nn, nn, lda, hA[b], hARes[b]); } // also check info for non positive definite cases err = 0; for(I b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; } template void potf2_potrf_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const I n, Td& dA, const I lda, const rocblas_stride stA, Id& dInfo, const I bc, Th& hA, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(I b = 0; b < bc; ++b) { POTRF ? 
cpu_potrf(uplo, n, hA[b], lda, hInfo[b]) : cpu_potf2(uplo, n, hA[b], lda, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { potf2_potrf_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular); start = get_time_us_sync(stream); rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_potf2_potrf(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); I n = argus.get("n"); I lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_fill uplo = char2rocblas_fill(uploC); I bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check || argus.hash_check) ? 
stA : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (I*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T*)nullptr, lda, stA, (I*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t hashA = 0, hashARes = 0; size_t size_ARes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (I*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T*)nullptr, lda, stA, (I*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (I*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, (T*)nullptr, lda, stA, (I*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); 
host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) potf2_potrf_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashARes); // collect performance data if(argus.timing) potf2_potrf_getPerfData( handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potf2_potrf(STRIDED, POTRF, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) potf2_potrf_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashARes); // collect performance data if(argus.timing) potf2_potrf_getPerfData( handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, 
&gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "lda", "batch_c"); rocsolver_bench_output(uploC, n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "lda", "strideA", "batch_c"); rocsolver_bench_output(uploC, n, lda, stA, bc); } else { rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); if(argus.hash_check) { rocsolver_bench_output("hash(A)", "hash(ARes)"); rocsolver_bench_output(ROCSOLVER_FORMAT_HASH(hashA), ROCSOLVER_FORMAT_HASH(hashARes)); rocsolver_bench_endl(); } } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_POTF2_POTRF(...) \ extern template void testing_potf2_potrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_POTF2_POTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_potri.cpp000066400000000000000000000033001503202240500237470ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_potri.hpp" #define TESTING_POTRI(...) template void testing_potri<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_potri.hpp000066400000000000000000000446751503202240500240000ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/

#pragma once

#include "common/misc/client_util.hpp"
#include "common/misc/clientcommon.hpp"
#include "common/misc/lapack_host_reference.hpp"
#include "common/misc/norm.hpp"
#include "common/misc/rocsolver.hpp"
#include "common/misc/rocsolver_arguments.hpp"
#include "common/misc/rocsolver_test.hpp"

// NOTE(review): the template parameter lists (`template <...>`) and the explicit
// template arguments used in this header appear to have been lost when the text
// was extracted; they are left exactly as found -- TODO restore from upstream.

// Calls rocsolver_potri with one deliberately invalid argument at a time and
// checks that the expected rocblas_status comes back.
//   handle       valid handle (a nullptr handle is substituted for the handle check)
//   uplo, n, lda, stA, bc
//                otherwise valid problem description
//   dA, dinfo    valid device arguments; replaced by null casts for the pointer checks
// The batch-count related checks are only executed when STRIDED is set.
template void potri_checkBadArgs(const rocblas_handle handle,
                        const rocblas_fill uplo,
                        const rocblas_int n,
                        T dA,
                        const rocblas_int lda,
                        const rocblas_stride stA,
                        U dinfo,
                        const rocblas_int bc)
{
    // handle
    EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, nullptr, uplo, n, dA, lda, stA, dinfo, bc),
                          rocblas_status_invalid_handle);

    // values
    EXPECT_ROCBLAS_STATUS(
        rocsolver_potri(STRIDED, handle, rocblas_fill_full, n, dA, lda, stA, dinfo, bc),
        rocblas_status_invalid_value);

    // sizes (only check batch_count if applicable)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, dinfo, -1),
                              rocblas_status_invalid_size);

    // pointers
    EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T) nullptr, lda, stA, dinfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, (U) nullptr, bc),
                          rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, 0, (T) nullptr, lda, stA, dinfo, bc),
                          rocblas_status_success);
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, (U) nullptr, 0),
                              rocblas_status_success);

    // quick return with zero batch_count if applicable
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, dA, lda, stA, dinfo, 0),
                              rocblas_status_success);
}

// Entry point of the bad-argument test: allocates minimal (size 1) device
// buffers -- a device_batch_vector when BATCHED, a device_strided_batch_vector
// otherwise -- and runs potri_checkBadArgs on them with n = lda = bc = 1.
template void testing_potri_bad_arg()
{
    // safe arguments
    rocblas_local_handle handle;
    rocblas_fill uplo = rocblas_fill_upper;
    rocblas_int n = 1;
    rocblas_int lda = 1;
    rocblas_stride stA = 1;
    rocblas_int bc = 1;

    if(BATCHED)
    {
        // memory allocations
        device_batch_vector dA(1, 1, 1);
        device_strided_batch_vector dinfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dinfo.memcheck());

        // check bad arguments
        potri_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc);
    }
    else
    {
        // memory allocations
        device_strided_batch_vector dA(1, 1, 1, 1);
        device_strided_batch_vector dinfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dinfo.memcheck());

        // check bad arguments
        potri_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dinfo.data(), bc);
    }
}

// Prepares the input matrices.
//   CPU branch: fills hA through rocblas_init, replaces every diagonal entry d
//               by d * sconj(d) * 400 and factorizes the matrix in place with
//               cpu_potrf. When `singular` is set, three diagonal entries of
//               the factor are zeroed in the batch instances bc/4, bc/2 and
//               bc-1.
//   GPU branch: copies hA to dA.
// handle, stA and dInfo are not used by the body.
template void potri_initData(const rocblas_handle handle,
                    const rocblas_fill uplo,
                    const rocblas_int n,
                    Td& dA,
                    const rocblas_int lda,
                    const rocblas_stride stA,
                    Ud& dInfo,
                    const rocblas_int bc,
                    Th& hA,
                    Uh& hInfo,
                    const bool singular)
{
    if(CPU)
    {
        rocblas_init(hA, true);

        for(rocblas_int b = 0; b < bc; ++b)
        {
            // scale to ensure positive definiteness
            for(rocblas_int i = 0; i < n; i++)
                hA[b][i + i * lda] = hA[b][i + i * lda] * sconj(hA[b][i + i * lda]) * 400;

            // do the Cholesky factorization of matrix A w/ the reference LAPACK routine
            cpu_potrf(uplo, n, hA[b], lda, hInfo[b]);

            if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1))
            {
                // add some singularities
                // always the same elements for debugging purposes
                // the algorithm must detect the first zero element in those
                // matrices in the batch that are singular
                // (`i -= (i / n) * n` wraps the index back into [0, n))
                rocblas_int i = n / 4 + b;
                i -= (i / n) * n;
                hA[b][i + i * lda] = 0;
                i = n / 2 + b;
                i -= (i / n) * n;
                hA[b][i + i * lda] = 0;
                i = n - 1 + b;
                i -= (i / n) * n;
                hA[b][i + i * lda] = 0;
            }
        }
    }

    if(GPU)
    {
        // now copy data to the GPU
        CHECK_HIP_ERROR(dA.transfer_from(hA));
    }
}

// Runs rocsolver_potri on the GPU and cpu_potri on the host and reports how far
// apart the results are.
//   hA / hInfo        receive the CPU reference result (hA is overwritten in place)
//   hARes / hInfoRes  receive the GPU result copied back from dA / dInfo
//   max_err           out: starts as the number of instances whose info value
//                     differs, then is raised to the largest norm error found
//                     among the instances whose GPU info is 0
template void potri_getError(const rocblas_handle handle,
                    const rocblas_fill uplo,
                    const rocblas_int n,
                    Td& dA,
                    const rocblas_int lda,
                    const rocblas_stride stA,
                    Ud& dInfo,
                    const rocblas_int bc,
                    Th& hA,
                    Th& hARes,
                    Uh& hInfo,
                    Uh& hInfoRes,
                    double* max_err,
                    const bool singular)
{
    // input data initialization
    potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular);

    // execute computations
    // GPU lapack
    CHECK_ROCBLAS_ERROR(
        rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc));
    CHECK_HIP_ERROR(hARes.transfer_from(dA));
    CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo));

    // CPU lapack
    for(rocblas_int b = 0; b < bc; ++b)
    {
        cpu_potri(uplo, n, hA[b], lda, hInfo[b]);
    }

    // check info for singularities
    double err = 0;
    *max_err = 0;
    for(rocblas_int b = 0; b < bc; ++b)
    {
        EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b;
        if(hInfo[b][0] != hInfoRes[b][0])
            err++;
    }
    *max_err += err;

    // error is ||hA - hARes|| / ||hA||
    // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES.
    // IT MIGHT BE REVISITED IN THE FUTURE)
    // using frobenius norm
    for(rocblas_int b = 0; b < bc; ++b)
    {
        // instances reported as singular by the GPU are left out of the norm check
        if(hInfoRes[b][0] == 0)
        {
            err = norm_error('F', n, n, lda, hA[b], hARes[b]);
            *max_err = err > *max_err ? err : *max_err;
        }
    }
}

// Collects timing data.
//   cpu_time_used  out: time of the cpu_potri loop over the batch; only measured
//                  when `perf` is false
//   gpu_time_used  in/out: the time of each hot call is added to it and the sum
//                  is then divided by hot_calls, so the caller must pass it
//                  zero-initialized
//   profile        when > 0, enables rocsolver profile logging (with kernel
//                  logging when profile_kernels is set) before the hot calls
// Two untimed cold calls are issued before the timed loop. The status returned
// by the timed call is not checked.
template void potri_getPerfData(const rocblas_handle handle,
                       const rocblas_fill uplo,
                       const rocblas_int n,
                       Td& dA,
                       const rocblas_int lda,
                       const rocblas_stride stA,
                       Ud& dInfo,
                       const rocblas_int bc,
                       Th& hA,
                       Uh& hInfo,
                       double* gpu_time_used,
                       double* cpu_time_used,
                       const rocblas_int hot_calls,
                       const int profile,
                       const bool profile_kernels,
                       const bool perf,
                       const bool singular)
{
    if(!perf)
    {
        potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular);

        // cpu-lapack performance (only if not in perf mode)
        *cpu_time_used = get_time_us_no_sync();
        for(rocblas_int b = 0; b < bc; ++b)
        {
            cpu_potri(uplo, n, hA[b], lda, hInfo[b]);
        }
        *cpu_time_used = get_time_us_no_sync() - *cpu_time_used;
    }

    potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular);

    // cold calls
    for(int iter = 0; iter < 2; iter++)
    {
        potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular);

        CHECK_ROCBLAS_ERROR(
            rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc));
    }

    // gpu-lapack performance
    hipStream_t stream;
    CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
    double start;

    if(profile > 0)
    {
        if(profile_kernels)
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile
                                         | rocblas_layer_mode_ex_log_kernel);
        else
            rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile);
        rocsolver_log_set_max_levels(profile);
    }

    for(rocblas_int iter = 0; iter < hot_calls; iter++)
    {
        // the input is re-initialized before every call; only the call itself
        // lies between the two synchronized time stamps
        potri_initData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, singular);

        start = get_time_us_sync(stream);
        rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc);
        *gpu_time_used += get_time_us_sync(stream) - start;
    }
    *gpu_time_used /= hot_calls;
}

// Test/benchmark driver. In order it:
//   1. reads uplo, n, lda, strideA, batch_count and iters from `argus`;
//   2. expects invalid_value for an unsupported uplo and invalid_size for
//      n < 0, lda < n or bc < 0, returning right after the check;
//   3. runs the device memory size query when requested or required;
//   4. allocates host/device buffers (batched or strided flavour);
//   5. expects success on the quick-return cases n == 0 or bc == 0;
//   6. runs potri_getError and/or potri_getPerfData;
//   7. validates max_error and prints the benchmark report.
template void testing_potri(Arguments& argus)
{
    // get arguments
    rocblas_local_handle handle;
    char uploC = argus.get("uplo");
    rocblas_int n = argus.get("n");
    rocblas_int lda = argus.get("lda", n);
    rocblas_stride stA = argus.get("strideA", lda * n);

    rocblas_fill uplo = char2rocblas_fill(uploC);
    rocblas_int bc = argus.batch_count;
    rocblas_int hot_calls = argus.iters;

    // the result buffer only needs a real stride when some check will read it
    rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0;

    // check non-supported values
    if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower)
    {
        if(BATCHED)
            EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T* const*)nullptr, lda,
                                                  stA, (rocblas_int*)nullptr, bc),
                                  rocblas_status_invalid_value);
        else
            EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T*)nullptr, lda, stA,
                                                  (rocblas_int*)nullptr, bc),
                                  rocblas_status_invalid_value);

        if(argus.timing)
            rocsolver_bench_inform(inform_invalid_args);

        return;
    }

    // determine sizes
    size_t size_A = size_t(lda) * n;
    double max_error = 0, gpu_time_used = 0, cpu_time_used = 0;

    size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0;

    // check invalid sizes
    bool invalid_size = (n < 0 || lda < n || bc < 0);
    if(invalid_size)
    {
        if(BATCHED)
            EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T* const*)nullptr, lda,
                                                  stA, (rocblas_int*)nullptr, bc),
                                  rocblas_status_invalid_size);
        else
            EXPECT_ROCBLAS_STATUS(rocsolver_potri(STRIDED, handle, uplo, n, (T*)nullptr, lda, stA,
                                                  (rocblas_int*)nullptr, bc),
                                  rocblas_status_invalid_size);

        if(argus.timing)
            rocsolver_bench_inform(inform_invalid_size);

        return;
    }

    // memory size query is necessary
    if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND)
    {
        CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle));
        if(BATCHED)
            CHECK_ALLOC_QUERY(rocsolver_potri(STRIDED, handle, uplo, n, (T* const*)nullptr, lda,
                                              stA, (rocblas_int*)nullptr, bc));
        else
            CHECK_ALLOC_QUERY(rocsolver_potri(STRIDED, handle, uplo, n, (T*)nullptr, lda, stA,
                                              (rocblas_int*)nullptr, bc));

        size_t size;
        CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size));
        if(argus.mem_query)
        {
            rocsolver_bench_inform(inform_mem_query, size);
            return;
        }

        CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size));
    }

    if(BATCHED)
    {
        // memory allocations
        host_batch_vector hA(size_A, 1, bc);
        host_batch_vector hARes(size_ARes, 1, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_batch_vector dA(size_A, 1, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check quick return
        if(n == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(
                rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc),
                rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            potri_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes,
                           &max_error, argus.singular);

        // collect performance data
        if(argus.timing)
            potri_getPerfData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used,
                              &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels,
                              argus.perf, argus.singular);
    }

    else
    {
        // memory allocations
        host_strided_batch_vector hA(size_A, 1, stA, bc);
        host_strided_batch_vector hARes(size_ARes, 1, stARes, bc);
        host_strided_batch_vector hInfo(1, 1, 1, bc);
        host_strided_batch_vector hInfoRes(1, 1, 1, bc);
        device_strided_batch_vector dA(size_A, 1, stA, bc);
        device_strided_batch_vector dInfo(1, 1, 1, bc);
        if(size_A)
            CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check quick return
        if(n == 0 || bc == 0)
        {
            EXPECT_ROCBLAS_STATUS(
                rocsolver_potri(STRIDED, handle, uplo, n, dA.data(), lda, stA, dInfo.data(), bc),
                rocblas_status_success);
            if(argus.timing)
                rocsolver_bench_inform(inform_quick_return);

            return;
        }

        // check computations
        if(argus.unit_check || argus.norm_check)
            potri_getError(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes,
                           &max_error, argus.singular);

        // collect performance data
        if(argus.timing)
            potri_getPerfData(handle, uplo, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used,
                              &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels,
                              argus.perf, argus.singular);
    }

    // validate results for rocsolver-test
    // using n * machine_precision as tolerance
    if(argus.unit_check)
        ROCSOLVER_TEST_CHECK(T, max_error, n);

    // output results for rocsolver-bench
    if(argus.timing)
    {
        if(!argus.perf)
        {
            rocsolver_bench_header("Arguments:");
            if(BATCHED)
            {
                rocsolver_bench_output("uplo", "n", "lda", "batch_c");
                rocsolver_bench_output(uploC, n, lda, bc);
            }
            else if(STRIDED)
            {
                rocsolver_bench_output("uplo", "n", "lda", "strideA", "batch_c");
                rocsolver_bench_output(uploC, n, lda, stA, bc);
            }
            else
            {
                rocsolver_bench_output("uplo", "n", "lda");
                rocsolver_bench_output(uploC, n, lda);
            }
            rocsolver_bench_header("Results:");
            if(argus.norm_check)
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error");
                rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error);
            }
            else
            {
                rocsolver_bench_output("cpu_time_us", "gpu_time_us");
                rocsolver_bench_output(cpu_time_used, gpu_time_used);
            }
            rocsolver_bench_endl();
        }
        else
        {
            if(argus.norm_check)
                rocsolver_bench_output(gpu_time_used, max_error);
            else
                rocsolver_bench_output(gpu_time_used);
        }
    }

    // ensure all arguments were consumed
    argus.validate_consumed();
}

// Declares (extern template) one testing_potri instantiation per combination
// generated by INSTANTIATE from the lists passed below.
#define EXTERN_TESTING_POTRI(...) extern template void testing_potri<__VA_ARGS__>(Arguments&);

INSTANTIATE(EXTERN_TESTING_POTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP)
rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_potrs.cpp000066400000000000000000000033221503202240500237650ustar00rootroot00000000000000/* **************************************************************************
 * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_potrs.hpp" #define TESTING_POTRS(...) template void testing_potrs<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_POTRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_potrs.hpp000066400000000000000000000445441503202240500240050ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void potrs_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const I n, const I nrhs, T dA, const I lda, const rocblas_stride stA, T dB, const I ldb, const rocblas_stride stB, const I bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, nullptr, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, rocblas_fill_full, n, nrhs, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T) nullptr, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, 
handle, uplo, 0, nrhs, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, 0, dA, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, 0), rocblas_status_success); } template void testing_potrs_bad_arg() { // safe arguments rocblas_local_handle handle; I n = 1; I nrhs = 1; I lda = 1; I ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; I bc = 1; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments potrs_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments potrs_checkBadArgs(handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } } template void potrs_initData(const rocblas_handle handle, const rocblas_fill uplo, const I n, const I nrhs, Td& dA, const I lda, const rocblas_stride stA, Td& dB, const I ldb, const rocblas_stride stB, const I bc, Th& hA, Th& hB) { if(CPU) { rocblas_init(hA, true); rocblas_init(hB, true); int info; for(I b = 0; b < bc; ++b) { // scale to ensure positive definiteness for(I i = 0; i < n; i++) hA[b][i + i * lda] = hA[b][i + i * lda] * sconj(hA[b][i + i * lda]) * 400; // do the Cholesky factorization of matrix A w/ the reference LAPACK routine cpu_potrf(uplo, n, hA[b], lda, &info); } } if(GPU) { // now copy matrices to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void potrs_getError(const rocblas_handle 
handle, const rocblas_fill uplo, const I n, const I nrhs, Td& dA, const I lda, const rocblas_stride stA, Td& dB, const I ldb, const rocblas_stride stB, const I bc, Th& hA, Th& hB, Th& hBRes, double* max_err) { // input data initialization potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); CHECK_HIP_ERROR(hBRes.transfer_from(dB)); // CPU lapack for(I b = 0; b < bc; ++b) { cpu_potrs(uplo, n, nrhs, hA[b], lda, hB[b], ldb); } // error is ||hB - hBRes|| / ||hB|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using vector-induced infinity norm double err; *max_err = 0; for(I b = 0; b < bc; ++b) { err = norm_error('I', n, nrhs, ldb, hB[b], hBRes[b]); *max_err = err > *max_err ? err : *max_err; } } template void potrs_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const I n, const I nrhs, Td& dA, const I lda, const rocblas_stride stA, Td& dB, const I ldb, const rocblas_stride stB, const I bc, Th& hA, Th& hB, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { if(!perf) { potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(I b = 0; b < bc; ++b) { cpu_potrs(uplo, n, nrhs, hA[b], lda, hB[b], ldb); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); // cold calls for(int iter = 0; iter < 2; iter++) { potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); CHECK_ROCBLAS_ERROR(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); } // gpu-lapack performance 
hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { potrs_initData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB); start = get_time_us_sync(stream); rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_potrs(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); I n = argus.get("n"); I nrhs = argus.get("nrhs", n); I lda = argus.get("lda", n); I ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * nrhs); rocblas_fill uplo = char2rocblas_fill(uploC); I bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? stB : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * nrhs; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_BRes = (argus.unit_check || argus.norm_check) ? 
size_B : 0; // check invalid sizes bool invalid_size = (n < 0 || nrhs < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc)); else CHECK_ALLOC_QUERY(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hBRes(size_BRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potrs_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, hBRes, &max_error); // collect performance data if(argus.timing) 
potrs_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hBRes(size_BRes, 1, stBRes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || nrhs == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_potrs(STRIDED, handle, uplo, n, nrhs, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) potrs_getError(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, hBRes, &max_error); // collect performance data if(argus.timing) potrs_getPerfData(handle, uplo, n, nrhs, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using m * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb", "strideA", "strideB", "batch_c"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb, stA, stB, bc); } else { rocsolver_bench_output("uplo", "n", "nrhs", "lda", "ldb"); rocsolver_bench_output(uploC, n, nrhs, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_POTRS(...) extern template void testing_potrs<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_POTRS, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, FOREACH_INT_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syev_heev.cpp000066400000000000000000000033201503202240500246110ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_syev_heev.hpp" #define TESTING_SYEV_HEEV(...) template void testing_syev_heev<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEV_HEEV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syev_heev.hpp000066400000000000000000000621601503202240500246250ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void syev_heev_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, S dE, const rocblas_stride stE, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, nullptr, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, rocblas_evect(0), uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, rocblas_fill_full, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T) nullptr, lda, 
stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, (S) nullptr, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, 0, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, 0), rocblas_status_success); } template void testing_syev_heev_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syev_heev_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); 
CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syev_heev_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } } template void syev_heev_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syev_heev_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Th& hAres, Sh& hD, Sh& hDres, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE = 3 * n - 1; int lwork = (COMPLEX ? 
2 * n - 1 : 0); std::vector work(lwork); std::vector hE(sizeE); std::vector A(lda * n * bc); // input data initialization syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); CHECK_HIP_ERROR(hDres.transfer_from(dD)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hAres.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, n, 1, hD[b], hDres[b]); *max_err = err > *max_err ? 
err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDres[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hAres[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hAres[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syev_heev_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Sh& hD, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE = 3 * n - 1; int lwork = (COMPLEX ? 
2 * n - 1 : 0); std::vector work(lwork); std::vector hE(sizeE); std::vector A; if(!perf) { syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syev_heev_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syev_heev(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); 
rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; if(argus.alg_mode == 1) { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid), rocblas_status_success); rocsolver_alg_mode alg_mode; EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode), rocblas_status_success); EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); } // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_D = n; size_t size_E = size_D; size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_Dres = (argus.unit_check || argus.norm_check) ? 
size_D : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hDres(size_Dres, 1, stD, bc); // device device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector 
hAres(size_Ares, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syev_heev_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syev_heev_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hAres(size_Ares, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syev_heev(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syev_heev_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syev_heev_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for 
rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideA", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stA, stD, stE, bc); } else { rocsolver_bench_output("evect", "uplo", "n", "lda"); rocsolver_bench_output(evectC, uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEV_HEEV(...) \ extern template void testing_syev_heev<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEV_HEEV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevd_heevd.cpp000066400000000000000000000033301503202240500251220ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_syevd_heevd.hpp" #define TESTING_SYEVD_HEEVD(...) template void testing_syevd_heevd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVD_HEEVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevd_heevd.hpp000066400000000000000000000637501503202240500251430ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void syevd_heevd_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, S dE, const rocblas_stride stE, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, nullptr, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, rocblas_evect(0), uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); 
EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, rocblas_fill_full, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T) nullptr, lda, stA, dD, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, (S) nullptr, stD, dE, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, 0, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, 0), rocblas_status_success); } template void testing_syevd_heevd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector 
dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevd_heevd_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevd_heevd_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); } } template void syevd_heevd_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevd_heevd_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Th& hAres, Sh& hD, Sh& hDres, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = 
rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE, lwork; if(!COMPLEX) { sizeE = (evect == rocblas_evect_none ? 2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { sizeE = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 1 : 3 + 5 * n); std::vector work(lwork); std::vector hE(sizeE); std::vector iwork(liwork); std::vector A(lda * n * bc); // input data initialization syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); CHECK_HIP_ERROR(hDres.transfer_from(dD)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hAres.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_syevd_heevd(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, iwork.data(), liwork, hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, n, 1, hD[b], hDres[b]); *max_err = err > *max_err ? 
err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDres[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hAres[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hAres[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syevd_heevd_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Id& dinfo, const rocblas_int bc, Th& hA, Sh& hD, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int sizeE, lwork; if(!COMPLEX) { sizeE = (evect == rocblas_evect_none ? 2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { sizeE = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 
1 : 3 + 5 * n); std::vector work(lwork); std::vector hE(sizeE); std::vector iwork(liwork); std::vector A; if(!perf) { syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syevd_heevd(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, hE.data(), sizeE, iwork.data(), liwork, hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevd_heevd_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevd_heevd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_evect evect = 
char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; if(argus.alg_mode == 1) { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_sterf, rocsolver_alg_mode_hybrid), rocblas_status_success); rocsolver_alg_mode alg_mode; EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_sterf, &alg_mode), rocblas_status_success); EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); } // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_D = n; size_t size_E = size_D; size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_Dres = (argus.unit_check || argus.norm_check) ? 
size_D : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hDres(size_Dres, 1, stD, bc); // device device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); 
host_batch_vector hAres(size_Ares, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevd_heevd_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevd_heevd_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hAres(size_Ares, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevd_heevd(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevd_heevd_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevd_heevd_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dE, stE, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); 
// output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideA", "strideD", "strideE", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stA, stD, stE, bc); } else { rocsolver_bench_output("evect", "uplo", "n", "lda"); rocsolver_bench_output(evectC, uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVD_HEEVD(...) \ extern template void testing_syevd_heevd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVD_HEEVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevdj_heevdj.cpp000066400000000000000000000033371503202240500254550ustar00rootroot00000000000000/* ************************************************************************ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ************************************************************************ */ #include "testing_syevdj_heevdj.hpp" #define TESTING_SYEVDJ_HEEVDJ(...) template void testing_syevdj_heevdj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVDJ_HEEVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevdj_heevdj.hpp000066400000000000000000000561321503202240500254630ustar00rootroot00000000000000/* ************************************************************************ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ************************************************************************ */ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void syevdj_heevdj_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS( rocsolver_syevdj_heevdj(STRIDED, nullptr, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, rocblas_evect(0), uplo, n, dA, lda, stA, dD, stD, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, 
evect, rocblas_fill_full, n, dA, lda, stA, dD, stD, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T) nullptr, lda, stA, dD, stD, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, (S) nullptr, stD, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, 0, (T) nullptr, lda, stA, (S) nullptr, stD, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA, lda, stA, dD, stD, (U) nullptr, 0), rocblas_status_success); } template void testing_syevdj_heevdj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdj_heevdj_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 
1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdj_heevdj_checkBadArgs(handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dinfo.data(), bc); } } template void syevdj_heevdj_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevdj_heevdj_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Id& dinfo, const rocblas_int bc, Th& hA, Th& hAres, Sh& hD, Sh& hDres, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lwork = (COMPLEX ? 
2 * n - 1 : 0); int lrwork = 3 * n - 1; std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dinfo.data(), bc)); CHECK_HIP_ERROR(hDres.transfer_from(dD)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hAres.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hD[b], work.data(), lwork, rwork.data(), lrwork, hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, n, 1, hD[b], hDres[b]); *max_err = err > *max_err ? 
err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hDres[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hAres[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hAres[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syevdj_heevdj_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Id& dinfo, const rocblas_int bc, Th& hA, Sh& hD, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector A(lda * n * bc); if(!perf) { // cpu-lapack performance (only if not in perf mode) *cpu_time_used = nan(""); } syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevdj_heevdj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = 
get_time_us_sync(stream); rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevdj_heevdj(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", n); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_D = n; size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_Dres = (argus.unit_check || argus.norm_check) ? 
size_D : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); host_strided_batch_vector hDres(size_Dres, 1, stD, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hAres(size_Ares, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) 
{ EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevdj_heevdj_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdj_heevdj_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hAres(size_Ares, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevdj_heevdj(STRIDED, handle, evect, uplo, n, dA.data(), lda, stA, dD.data(), stD, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevdj_heevdj_getError(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, bc, hA, hAres, hD, hDres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdj_heevdj_getPerfData(handle, evect, uplo, n, dA, lda, stA, dD, stD, dinfo, bc, hA, hD, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideD", "batch_c"); 
rocsolver_bench_output(evectC, uploC, n, lda, stD, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "uplo", "n", "lda", "strideA", "strideD", "batch_c"); rocsolver_bench_output(evectC, uploC, n, lda, stA, stD, bc); } else { rocsolver_bench_output("evect", "uplo", "n", "lda"); rocsolver_bench_output(evectC, uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVDJ_HEEVDJ(...) \ extern template void testing_syevdj_heevdj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVDJ_HEEVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevdx_heevdx.cpp000066400000000000000000000033331503202240500255050ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_syevdx_heevdx.hpp" #define TESTING_SYEVDX_HEEVDX(...) template void testing_syevdx_heevdx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVDX_HEEVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevdx_heevdx.hpp000066400000000000000000000762741503202240500255300ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void syevdx_heevdx_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS vl, const SS vu, const rocblas_int il, const rocblas_int iu, U dNev, S dW, const rocblas_stride stW, T dZ, const rocblas_int ldz, const rocblas_stride stZ, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, nullptr, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, rocblas_evect(0), erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc), rocblas_status_invalid_value); 
EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, rocblas_erange(0), uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, rocblas_fill_full, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, (T) nullptr, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, (U) nullptr, dW, stW, dZ, ldz, stZ, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, (S) nullptr, stW, dZ, ldz, stZ, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, (T) nullptr, ldz, stZ, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, 0, (T) nullptr, lda, stA, vl, vu, il, iu, dNev, (S) nullptr, stW, (T) nullptr, ldz, stZ, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) 
EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, (U) nullptr, dW, stW, dZ, ldz, stZ, (U) nullptr, 0), rocblas_status_success); } template void testing_syevdx_heevdx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldz = 1; rocblas_stride stA = 1; rocblas_stride stW = 1; rocblas_stride stZ = 1; rocblas_int bc = 1; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dZ(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdx_heevdx_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdx_heevdx_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dinfo.data(), bc); } } template void syevdx_heevdx_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& 
dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // construct well conditioned matrix A such that all eigenvalues are in (-20, 20) for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevdx_heevdx_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Id& dNev, Sd& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Ih& hNevRes, Sh& hW, Sh& hWRes, Th& hZ, Th& hZRes, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 
0 : 7 * n; int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A(lda * n * bc); std::vector hIfail(n); // input data initialization syevdx_heevdx_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dinfo.data(), bc)); CHECK_HIP_ERROR(hNevRes.transfer_from(dNev)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = 2 * get_safemin(); for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail.data(), hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // Check number of returned eigenvalues double err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hNev[b][0], hNevRes[b][0]) << "where b = " << b; if(hNev[b][0] != hNevRes[b][0]) err++; } *max_err = err > *max_err ? err : *max_err; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? 
err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the nev eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hZRes[b] + j * ldz, 1, beta, hZ[b] + j * ldz, 1); } // error is ||hZ - hZRes|| / ||hZ|| // using frobenius norm err = norm_error('F', n, hNev[b][0], ldz, hZ[b], hZRes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syevdx_heevdx_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Id& dNev, Sd& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Sh& hW, Th& hZ, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector A(lda * n * bc); if(!perf) { // cpu-lapack performance (only if not in perf mode) *cpu_time_used = nan(""); } syevdx_heevdx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevdx_heevdx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevdx_heevdx( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | 
rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevdx_heevdx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevdx_heevdx(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldz = argus.get("ldz", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stW = argus.get("strideW", n); rocblas_stride stZ = argus.get("strideZ", ldz * n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 
1 : 0); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_W = n; size_t size_Z = size_t(ldz) * n; size_t size_WRes = (argus.unit_check || argus.norm_check) ? size_W : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check) ? 
size_Z : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || (evect != rocblas_evect_none && ldz < n) || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevdx_heevdx( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY( rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); 
host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWres(size_WRes, 1, stW, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); // device device_strided_batch_vector dNev(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hZ(size_Z, 1, bc); host_batch_vector hZRes(size_ZRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dZ(size_Z, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevdx_heevdx_getError(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc, hA, hNev, hNevRes, hW, hWres, hZ, hZRes, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdx_heevdx_getPerfData( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc, hA, hNev, hW, hZ, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hZ(size_Z, 1, stZ, bc); host_strided_batch_vector hZRes(size_ZRes, 1, stZ, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dZ(size_Z, 1, stZ, bc); if(size_A) 
CHECK_HIP_ERROR(dA.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevdx_heevdx_getError(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc, hA, hNev, hNevRes, hW, hWres, hZ, hZRes, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdx_heevdx_getPerfData( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dinfo, bc, hA, hNev, hW, hZ, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 3 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 3 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "strideW", "ldz", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, stW, ldz, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "strideA", "vl", "vu", "il", "iu", "strideW", "ldz", "strideZ", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, stA, vl, vu, il, iu, stW, ldz, stZ, bc); } else { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "ldz"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, ldz); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", 
"gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVDX_HEEVDX(...) \ extern template void testing_syevdx_heevdx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVDX_HEEVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevdx_heevdx_inplace.hpp000066400000000000000000000755431503202240500272210ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void syevdx_heevdx_inplace_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS vl, const SS vu, const rocblas_int il, const rocblas_int iu, const SS abstol, U hNev, S dW, const rocblas_stride stW, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, nullptr, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, rocblas_evect(0), erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, rocblas_erange(0), uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, rocblas_fill_full, n, dA, lda, stA, vl, vu, 
il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, (S) nullptr, stW, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNev, dW, stW, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, 0, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, hNev, (S) nullptr, stW, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, (U) nullptr, 0), rocblas_status_success); } template void testing_syevdx_heevdx_inplace_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stW = 
1; rocblas_int bc = 1; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); host_strided_batch_vector hNev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdx_heevdx_inplace_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); host_strided_batch_vector hNev(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevdx_heevdx_inplace_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dinfo.data(), bc); } } template void syevdx_heevdx_inplace_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // construct well conditioned matrix A such that all eigenvalues are in (-20, 20) for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { 
for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevdx_heevdx_inplace_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Ih& hNevRes, Sd& dW, const rocblas_stride stW, Id& dinfo, const rocblas_int bc, Th& hA, Th& hARes, Ih& hNev, Sh& hW, Sh& hWRes, Ih& hinfo, Ih& hinfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A(lda * n * bc); std::vector Z(ldz * n); std::vector ifail(n); // input data initialization syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // Check number of returned eigenvalues double err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hNev[b][0], hNevRes[b][0]) << "where b = " << b; if(hNev[b][0] != hNevRes[b][0]) err++; } *max_err = err > *max_err ? err : *max_err; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hinfo[b][0] == 0) err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hinfo[b][0] == 0) { // multiply A with each of the nev eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hZ - hZRes|| / ||hZ|| // using frobenius norm err = norm_error('F', n, hNev[b][0], lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } } } } template void syevdx_heevdx_inplace_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Ih& hNevRes, Sd& dW, const rocblas_stride stW, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Sh& hW, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A; std::vector Z(ldz * n); std::vector ifail(n); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevdx_heevdx_inplace_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevdx_heevdx_inplace(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = 
argus.get("strideA", lda * n); rocblas_stride stW = argus.get("strideW", n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_W = n; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevdx_heevdx_inplace( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWres(size_WRes, 1, stW, bc); 
host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); // device device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevdx_heevdx_inplace_getError( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hARes, hNev, hW, hWres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdx_heevdx_inplace_getPerfData( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hNev, hW, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevdx_heevdx_inplace(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { 
syevdx_heevdx_inplace_getError( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hARes, hNev, hW, hWres, hinfo, hinfoRes, &max_error); } // collect performance data if(argus.timing) { syevdx_heevdx_inplace_getPerfData( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, hNevRes, dW, stW, dinfo, bc, hA, hNev, hW, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 3 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 3 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol, stW, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "strideA", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, stA, vl, vu, il, iu, abstol, stW, bc); } else { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } 
rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevj_heevj.cpp000066400000000000000000000033301503202240500251360ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_syevj_heevj.hpp" #define TESTING_SYEVJ_HEEVJ(...) 
template void testing_syevj_heevj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVJ_HEEVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevj_heevj.hpp000066400000000000000000000732451503202240500251570ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void syevj_heevj_checkBadArgs(const rocblas_handle handle, const rocblas_esort esort, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS abstol, S dResidual, const rocblas_int max_sweeps, U dSweeps, S dW, const rocblas_stride stW, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, nullptr, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, rocblas_esort(0), evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, rocblas_evect(0), uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, rocblas_fill_full, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), 
rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, (U) nullptr, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, 0, (T) nullptr, lda, stA, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stW, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA, lda, stA, abstol, (S) nullptr, max_sweeps, (U) nullptr, dW, stW, (U) nullptr, 0), rocblas_status_success); } template void testing_syevj_heevj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_none; rocblas_esort esort = rocblas_esort_ascending; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stW = 1; rocblas_int bc = 1; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); 
CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments syevj_heevj_checkBadArgs(handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments syevj_heevj_checkBadArgs(handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } } template void syevj_heevj_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; else hA[b][i + j * lda] -= 4; } } // make copy of original data to test vectors if required if(test) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevj_heevj_getError(const rocblas_handle handle, const rocblas_esort esort, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S 
abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Sh& hResidualRes, Ih& hSweepsRes, Sh& hW, Sh& hWRes, Ih& hInfo, Ih& hInfoRes, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; S atol = (abstol <= 0) ? get_epsilon() : abstol; int lwork = (COMPLEX ? 2 * n - 1 : 0); int lrwork = 3 * n - 1; std::vector work(lwork); std::vector rwork(lrwork); std::vector A(lda * n * bc); // input data initialization syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect == rocblas_evect_original) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hW[b], work.data(), lwork, rwork.data(), lrwork, hInfo[b]); // (We expect the used input matrices to always converge) // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfoRes[b][0], 0) << "where b = " << b; if(hInfoRes[b][0] != 0) *max_err += 1; } // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) { EXPECT_GE(hResidualRes[b][0], 0) << "where b = " << b; if(hResidualRes[b][0] < 0) *max_err += 1; else { S threshold = snorm('F', n, n, A.data() + b * lda * n, lda) * atol; EXPECT_LE(hResidualRes[b][0], threshold) << "where b = " << b; if(hResidualRes[b][0] > threshold) *max_err += 1; } } // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) { EXPECT_GE(hSweepsRes[b][0], 0) << "where b = " << b; EXPECT_LE(hSweepsRes[b][0], max_sweeps) << 
"where b = " << b; if(hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps) *max_err += 1; } double err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK // (no need to test the non-sorted case --lapack return sorted eigenvalues--) // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfo[b][0] == 0 && esort == rocblas_esort_ascending) err = norm_error('F', 1, n, 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfo[b][0] == 0) { // multiply A with each of the n eigenvectors and divide by corresponding // eigenvalues T alpha; T beta = 0; for(int j = 0; j < n; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A.data() + b * lda * n, lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void syevj_heevj_getPerfData(const rocblas_handle handle, const rocblas_esort esort, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Sh& hW, Ih& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 
2 * n - 1 : 0); int lrwork = 3 * n - 1; std::vector work(lwork); std::vector rwork(lrwork); std::vector A; if(!perf) { syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syev_heev(evect, uplo, n, hA[b], lda, hW[b], work.data(), lwork, rwork.data(), lrwork, hInfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevj_heevj_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevj_heevj(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char esortC = argus.get("esort"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stW = 
argus.get("strideD", n); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_esort esort = char2rocblas_esort(esortC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_W = n; size_t size_Ares = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_Wres = (argus.unit_check || argus.norm_check) ? 
size_W : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, (T* const*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevj_heevj( STRIDED, handle, esort, evect, uplo, n, (T*)nullptr, lda, stA, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); host_strided_batch_vector hWRes(size_Wres, 1, stW, bc); // device device_strided_batch_vector dResidual(1, 1, 1, bc); 
device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_Ares, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) { syevj_heevj_getError(handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error); } // collect performance data if(argus.timing) { syevj_heevj_getPerfData( handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_Ares, 1, stA, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevj_heevj(STRIDED, handle, esort, evect, uplo, n, dA.data(), lda, stA, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations 
if(argus.unit_check || argus.norm_check) { syevj_heevj_getError(handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error); } // collect performance data if(argus.timing) { syevj_heevj_getPerfData( handle, esort, evect, uplo, n, dA, lda, stA, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("esort", "evect", "uplo", "n", "lda", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(esortC, evectC, uploC, n, lda, abstol, max_sweeps, stW, bc); } else if(STRIDED) { rocsolver_bench_output("esort", "evect", "uplo", "n", "lda", "strideA", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(esortC, evectC, uploC, n, lda, stA, abstol, max_sweeps, stW, bc); } else { rocsolver_bench_output("esort", "evect", "uplo", "n", "lda", "abstol", "max_sweeps"); rocsolver_bench_output(esortC, evectC, uploC, n, lda, abstol, max_sweeps); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVJ_HEEVJ(...) 
\ extern template void testing_syevj_heevj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVJ_HEEVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevx_heevx.cpp000066400000000000000000000033301503202240500251720ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_syevx_heevx.hpp" #define TESTING_SYEVX_HEEVX(...) 
template void testing_syevx_heevx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYEVX_HEEVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_syevx_heevx.hpp000066400000000000000000001141641503202240500252070ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/matrix_utils/matrix_utils.hpp" #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void syevx_heevx_checkBadArgs(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, const SS vl, const SS vu, const rocblas_int il, const rocblas_int iu, const SS abstol, U dNev, S dW, const rocblas_stride stW, T dZ, const rocblas_int ldz, const rocblas_stride stZ, U dIfail, const rocblas_stride stF, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, nullptr, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, rocblas_evect(0), erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, rocblas_erange(0), uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, rocblas_fill_full, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, 
stZ, dIfail, stF, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, (S) nullptr, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, (T) nullptr, ldz, stZ, dIfail, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, (U) nullptr, stF, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, 0, (T) nullptr, lda, stA, vl, vu, il, iu, abstol, dNev, (S) nullptr, stW, (T) nullptr, ldz, stZ, (U) nullptr, stF, dinfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, (U) nullptr, dW, stW, dZ, ldz, stZ, dIfail, stF, (U) nullptr, 0), rocblas_status_success); } template void testing_syevx_heevx_bad_arg() { using S = 
decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_lower; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldz = 1; rocblas_stride stA = 1; rocblas_stride stW = 1; rocblas_stride stZ = 1; rocblas_stride stF = 1; rocblas_int bc = 1; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dZ(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevx_heevx_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dinfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); // check bad arguments syevx_heevx_checkBadArgs(handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc); } } template void syevx_heevx_initData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_int n, 
Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, std::vector& A, bool test = true) { if(CPU) { rocblas_init(hA, true); // construct well conditioned matrix A such that all eigenvalues are in (-20, 20) #ifdef ROCSOLVER_TESTS_USE_DEPRECATED_INITIALIZERS // Old matrix initialization for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = i; j < n; j++) { if(i == j) hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } #else // New matrix initialization using HMat = HostMatrix; using BDesc = typename HMat::BlockDescriptor; for(rocblas_int b = 0; b < bc; ++b) { auto hAw = HMat::Wrap(hA[b], lda, n); if(hAw) // update matrix hA if n >= 1 { rocblas_int half_n = n / 2; // create half_n eigenvalues from -20 to -1, and n - half_n eigenvalues from 1 to 20 // (for the time being, avoid using -20 and 20, otherwise some tests will fail with hNev != hNevRes) auto eigs = cat(HMat::FromRange(-20.1, -1.0, half_n), HMat::FromRange(1.0, 20.1, n - half_n)); auto [Q, _] = qr((*hAw).block(BDesc().nrows(n).ncols(n))); hAw->set_to_zero(); hAw->copy_data_from(Q * HMat::Zeros(n).diag(eigs) * adjoint(Q)); } #endif // make copy of original data to test vectors if required if(test && evect == rocblas_evect_original) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) A[b * lda * n + i + j * lda] = hA[b][i + j * lda]; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void syevx_heevx_getError(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, 
const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Id& dNev, Sd& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Id& dIfail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Ih& hNevRes, Sh& hW, Sh& hWRes, Th& hZ, Th& hZRes, Ih& hIfail, Ih& hIfailRes, Ih& hinfo, Ih& hinfoRes, double* max_err, size_t& hashA, size_t& hashW, size_t& hashZ) { using HMat = HostMatrix; using BDesc = typename HMat::BlockDescriptor; constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A(lda * n * bc); // input data initialization syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A); // hash inputs hashA = deterministic_hash(hA, bc); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc)); CHECK_HIP_ERROR(hNevRes.transfer_from(dNev)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hinfoRes.transfer_from(dinfo)); if(evect == rocblas_evect_original) { CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); } // hash outputs hashW = deterministic_hash(hWRes, bc); if(evect == rocblas_evect_original) hashZ = deterministic_hash(hZRes, bc); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hinfo[b]); // Check info for non-convergence *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hinfo[b][0], hinfoRes[b][0]) << "where b = " << b; if(hinfo[b][0] != hinfoRes[b][0]) *max_err += 1; } // Check number of returned eigenvalues double err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hNev[b][0], hNevRes[b][0]) << "where b = " << b; if(hNev[b][0] != hNevRes[b][0]) err++; } *max_err = err > *max_err ? err : *max_err; // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved). for(rocblas_int b = 0; b < bc; ++b) { // Number of eigenvalues auto num_eigs = hNev[b][0]; if((hinfo[b][0] != 0) || (num_eigs <= 0)) { if(evect == rocblas_evect_original) { // check ifail err = 0; for(int j = 0; j < hinfo[b][0]; j++) { EXPECT_NE(hIfailRes[b][j], 0) << "where b = " << b << ", j = " << j; if(hIfailRes[b][j] == 0) err++; } *max_err = err > *max_err ? err : *max_err; } continue; } // Get reference eigenvalues (will be updated in a subsequent commit) auto eigs_ref = *HMat::Convert(hW[b], 1, num_eigs); // Get computed eigenvalues auto eigs_b = *HMat::Convert( hWRes[b], 1, num_eigs); // convert eigenvalues from type S to type T, if required if(evect != rocblas_evect_original) { // only eigenvalues needed; can compare with LAPACK err = (eigs_ref - eigs_b).max_coeff_norm() / eigs_ref.max_coeff_norm(); *max_err = err > *max_err ? 
err : *max_err; } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling // check ifail err = 0; for(int j = 0; j < hNev[b][0]; j++) { EXPECT_EQ(hIfailRes[b][j], 0) << "where b = " << b << ", j = " << j; if(hIfailRes[b][j] != 0) err++; } *max_err = err > *max_err ? err : *max_err; // Create a thin wrapper of input matrix A (bc * lda * n), of size lda * n starting at b * lda * n auto AWrap_b = HMat::Wrap(A.data() + b * lda * n, lda, n); // Since `lda`, `ldz` and `n` can differ, we must extract submatrices of the correct size from // `A`, `hZRes` and `hWRes`. // // We want the sub-block starting from row 0, col 0 and with size n x n of A auto block_A = BDesc().from_row(0).from_col(0).nrows(n).ncols( n); // the `from_row(0)` and `from_col(0)` calls can be omitted auto A_b = (*AWrap_b).block(block_A); // Get computed eigenvectors auto V_b = (*HMat::Wrap(hZRes[b], ldz, n)).block(BDesc().nrows(n).ncols(num_eigs)); // Check orthogonality of computed eigenvectors auto OE = adjoint(V_b) * V_b - HMat::Eye(num_eigs); S ortho_err = OE.norm(); *max_err = ortho_err > *max_err ? ortho_err : *max_err; // Check accuracy of eigenpairs auto AE = adjoint(V_b) * A_b * V_b - HMat::Zeros(num_eigs).diag(eigs_b); err = AE.max_col_norm() / eigs_ref.max_coeff_norm(); /* err *= std::numeric_limits::epsilon() / ortho_err; // Use "relative Weyl" error bound */ *max_err = err > *max_err ? 
err : *max_err; } } } template void syevx_heevx_getPerfData(const rocblas_handle handle, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Id& dNev, Sd& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Id& dIfail, const rocblas_stride stF, Id& dinfo, const rocblas_int bc, Th& hA, Ih& hNev, Sh& hW, Th& hZ, Ih& hIfail, Ih& hinfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = !COMPLEX ? 35 * n : 33 * n; int lrwork = !COMPLEX ? 0 : 7 * n; int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector A; // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) cpu_syevx_heevx(evect, erange, uplo, n, hA[b], lda, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hinfo[b]); *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); // cold calls for(int iter = 0; iter < 2; iter++) { syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); CHECK_ROCBLAS_ERROR(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { syevx_heevx_initData(handle, evect, n, dA, lda, bc, hA, A, 0); start = get_time_us_sync(stream); rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_syevx_heevx(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldz = argus.get("ldz", 
n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stW = argus.get("strideW", n); rocblas_stride stZ = argus.get("strideZ", ldz * n); rocblas_stride stF = argus.get("strideF", n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_W = n; size_t size_Z = size_t(ldz) * n; size_t size_ifail = n; size_t size_WRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_W : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_Z : 0; size_t size_ifailRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? 
size_ifail : 0; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t hashA = 0, hashW = 0, hashZ = 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || (evect != rocblas_evect_none && ldz < n) || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_syevx_heevx( STRIDED, handle, evect, erange, uplo, n, (T*)nullptr, lda, stA, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWres(size_WRes, 1, stW, bc); host_strided_batch_vector hIfail(size_ifail, 1, stF, bc); host_strided_batch_vector hIfailRes(size_ifailRes, 1, stF, bc); host_strided_batch_vector hinfo(1, 1, 1, bc); host_strided_batch_vector hinfoRes(1, 1, 1, bc); // device device_strided_batch_vector dNev(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dIfail(size_ifail, 1, stF, bc); device_strided_batch_vector dinfo(1, 1, 1, bc); CHECK_HIP_ERROR(dNev.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(size_ifail) CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dinfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hZ(size_Z, 1, bc); host_batch_vector hZRes(size_ZRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dZ(size_Z, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) { syevx_heevx_getError( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc, hA, hNev, hNevRes, hW, hWres, hZ, hZRes, hIfail, hIfailRes, hinfo, hinfoRes, &max_error, hashA, hashW, hashZ); } // collect performance data if(argus.timing) { 
syevx_heevx_getPerfData(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc, hA, hNev, hW, hZ, hIfail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hZ(size_Z, 1, stZ, bc); host_strided_batch_vector hZRes(size_ZRes, 1, stZ, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dZ(size_Z, 1, stZ, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_syevx_heevx(STRIDED, handle, evect, erange, uplo, n, dA.data(), lda, stA, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dinfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) { syevx_heevx_getError( handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc, hA, hNev, hNevRes, hW, hWres, hZ, hZRes, hIfail, hIfailRes, hinfo, hinfoRes, &max_error, hashA, hashW, hashZ); } // collect performance data if(argus.timing) { syevx_heevx_getPerfData(handle, evect, erange, uplo, n, dA, lda, stA, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dinfo, bc, hA, hNev, hW, hZ, hIfail, hinfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("evect", 
"erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideF", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol, stW, ldz, stF, bc); } else if(STRIDED) { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "strideA", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideZ", "strideF", "batch_c"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, stA, vl, vu, il, iu, abstol, stW, ldz, stZ, stF, bc); } else { rocsolver_bench_output("evect", "erange", "uplo", "n", "lda", "vl", "vu", "il", "iu", "abstol", "ldz"); rocsolver_bench_output(evectC, erangeC, uploC, n, lda, vl, vu, il, iu, abstol, ldz); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); if(argus.hash_check) { rocsolver_bench_output("hash(A)", "hash(W)", "hash(Z)"); rocsolver_bench_output(ROCSOLVER_FORMAT_HASH(hashA), ROCSOLVER_FORMAT_HASH(hashW), ROCSOLVER_FORMAT_HASH(hashZ)); rocsolver_bench_endl(); } } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYEVX_HEEVX(...) \ extern template void testing_syevx_heevx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYEVX_HEEVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygsx_hegsx.cpp000066400000000000000000000034411503202240500251730ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sygsx_hegsx.hpp" #define TESTING_SYGSX_HEGSX(...) template void testing_sygsx_hegsx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGSX_HEGSX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygsx_hegsx.hpp000066400000000000000000000575671503202240500252220ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sygsx_hegsx_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, nullptr, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, rocblas_eform(0), uplo, n, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, 0, (T) nullptr, lda, stA, dB, ldb, stB, bc), rocblas_status_success); EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, 0, dA, lda, stA, (T) nullptr, ldb, stB, bc), 
rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, 0), rocblas_status_success); } template void testing_sygsx_hegsx_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_eform itype = rocblas_eform_ax; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int ldb = 1; rocblas_stride stB = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments sygsx_hegsx_checkBadArgs(handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); // check bad arguments sygsx_hegsx_checkBadArgs(handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc); } } template void sygsx_hegsx_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& M, const bool test) { if(CPU) { rocblas_int info; const rocblas_int ldu = n; host_strided_batch_vector U(n * n, 1, n * n, bc); rocblas_init(hA, true); rocblas_init(U, true); for(rocblas_int b = 0; b < bc; ++b) { // for testing purposes, we start with the reduced matrix M of the standard equivalent problem. 
// Then we construct the generalized pair (A, B) from there for(rocblas_int i = 0; i < n; i++) { // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular) or hB = U' for(rocblas_int j = i; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 100; U[b][i + j * ldu] = std::real(U[b][i + j * ldu]) / 100 + 1; hB[b][i + j * ldb] = U[b][i + j * ldu]; } else { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); U[b][i + j * ldu] = (U[b][i + j * ldu] - 5) / 100; if(uplo == rocblas_fill_upper) { hB[b][i + j * ldb] = U[b][i + j * ldu]; hB[b][j + i * ldb] = 0; } else { hB[b][j + i * ldb] = sconj(U[b][i + j * ldu]); hB[b][i + j * ldb] = 0; } } } } // store M = hA for implicit testing if(test) { for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) M[b][i + j * lda] = hA[b][i + j * lda]; } T one = T(1); if(itype == rocblas_eform_ax) { // form A = U' M U cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trmm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } else { // form A = inv(U) M inv(U') cpu_trsm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trsm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygsx_hegsx_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, 
double* max_err) { constexpr bool VERIFY_IMPLICIT_TEST = false; host_strided_batch_vector M(lda * n, 1, lda * n, bc); // input data initialization sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, true); // execute computations // use verify_implicit_test to check correctness of the implicit test using // CPU lapack if(!VERIFY_IMPLICIT_TEST) { // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); } else { // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { memcpy(hARes[b], hA[b], lda * n * sizeof(T)); SYGST ? cpu_sygst_hegst(itype, uplo, n, hARes[b], lda, hB[b], ldb) : cpu_sygs2_hegs2(itype, uplo, n, hARes[b], lda, hB[b], ldb); } } // error is ||M - hARes|| / ||M|| // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { if(uplo == rocblas_fill_upper) err = norm_error_upperTr('F', n, n, lda, M[b], hARes[b]); else err = norm_error_lowerTr('F', n, n, lda, M[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } template void sygsx_hegsx_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { host_strided_batch_vector M(lda * n, 1, lda * n, bc); if(!perf) { sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { SYGST ? 
cpu_sygst_hegst(itype, uplo, n, hA[b], lda, hB[b], ldb) : cpu_sygs2_hegs2(itype, uplo, n, hA[b], lda, hB[b], ldb); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); // cold calls for(int iter = 0; iter < 2; iter++) { sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); CHECK_ROCBLAS_ERROR(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygsx_hegsx_initData(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, M, false); start = get_time_us_sync(stream); rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygsx_hegsx(Arguments& argus) { // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stBRes = (argus.unit_check || argus.norm_check) ? 
stB : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // 
memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygsx_hegsx_getError(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hARes, hB, &max_error); // collect performance data if(argus.timing) sygsx_hegsx_getPerfData( handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygsx_hegsx(STRIDED, SYGST, handle, itype, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygsx_hegsx_getError(handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hARes, hB, &max_error); // collect performance data if(argus.timing) sygsx_hegsx_getPerfData( handle, itype, uplo, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, 
argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "uplo", "n", "lda", "ldb", "batch_c"); rocsolver_bench_output(itypeC, uploC, n, lda, ldb, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "uplo", "n", "lda", "strideA", "ldb", "strideB", "batch_c"); rocsolver_bench_output(itypeC, uploC, n, lda, stA, ldb, stB, bc); } else { rocsolver_bench_output("itype", "uplo", "n", "lda", "ldb"); rocsolver_bench_output(itypeC, uploC, n, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGSX_HEGSX(...) \ extern template void testing_sygsx_hegsx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGSX_HEGSX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygv_hegv.cpp000066400000000000000000000033201503202240500246150ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sygv_hegv.hpp" #define TESTING_SYGV_HEGV(...) template void testing_sygv_hegv<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGV_HEGV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygv_hegv.hpp000066400000000000000000001000531503202240500246230ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sygv_hegv_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, U dD, const rocblas_stride stD, U dE, const rocblas_stride stE, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, rocblas_eform(0), 
evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, rocblas_evect(0), uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, rocblas_evect_tridiagonal, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, (U) nullptr, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, (U) nullptr, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (U) nullptr, 
stD, (U) nullptr, stE, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_sygv_hegv_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygv_hegv_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygv_hegv_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } } template void sygv_hegv_initData(const rocblas_handle handle, const rocblas_eform itype, const 
rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_init(hA, true); rocblas_init(hB, false); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; } else { hA[b][i + j * lda] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygv_hegv_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, 
Th& hA, Th& hARes, Th& hB, Uh& hD, Uh& hDRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). // check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; } double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hInfoRes[b][0] == 0) { err = norm_error('F', 1, n, 1, hD[b], hDRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfoRes[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void sygv_hegv_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hD, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); if(!perf) { sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygv_hegv_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygv_hegv(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = 
argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stDRes = (argus.unit_check || argus.norm_check) ? stD : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_D = size_t(n); size_t size_E = size_D; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? 
size_D : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hDRes(size_DRes, 1, stDRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); 
if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygv_hegv_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygv_hegv_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygv_hegv(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygv_hegv_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, 
stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygv_hegv_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", "strideB", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, stD, stE, bc); } else { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGV_HEGV(...) 
\ extern template void testing_sygv_hegv<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGV_HEGV, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvd_hegvd.cpp000066400000000000000000000033301503202240500251260ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sygvd_hegvd.hpp" #define TESTING_SYGVD_HEGVD(...) 
template void testing_sygvd_hegvd<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVD_HEGVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvd_hegvd.hpp000066400000000000000000001020021503202240500251270ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sygvd_hegvd_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, U dD, const rocblas_stride stD, U dE, const rocblas_stride stE, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, rocblas_eform(0), evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, rocblas_evect(0), uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, rocblas_evect_tridiagonal, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, -1), rocblas_status_invalid_size); // pointers 
EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, dD, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, (U) nullptr, stD, dE, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, (U) nullptr, stE, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (U) nullptr, stD, (U) nullptr, stE, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_sygvd_hegvd_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); 
device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvd_hegvd_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvd_hegvd_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc); } } template void sygvd_hegvd_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_init(hA, true); rocblas_init(hB, false); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; } else { hA[b][i + j * lda] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 
0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvd_hegvd_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Uh& hD, Uh& hDRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lrwork, lwork; if(!COMPLEX) { lrwork = (evect == rocblas_evect_none ? 2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { lrwork = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 
1 : 3 + 5 * n); std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvd_hegvd(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). // check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; } double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hInfoRes[b][0] == 0) { err = norm_error('F', 1, n, 1, hD[b], hDRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfoRes[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void sygvd_hegvd_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Ud& dE, const rocblas_stride stE, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hD, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); int lrwork, lwork; if(!COMPLEX) { lrwork = (evect == rocblas_evect_none ? 
2 * n + 1 : 1 + 6 * n + 2 * n * n); lwork = 0; } else { lrwork = (evect == rocblas_evect_none ? n : 1 + 5 * n + 2 * n * n); lwork = (evect == rocblas_evect_none ? n + 1 : 2 * n + n * n); } int liwork = (evect == rocblas_evect_none ? 1 : 3 + 5 * n); std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); if(!perf) { sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvd_hegvd(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvd_hegvd_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, 
dE.data(), stE, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvd_hegvd(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stDRes = (argus.unit_check || argus.norm_check) ? stD : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_D = size_t(n); size_t size_E = size_D; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? size_D : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvd_hegvd( STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (S*)nullptr, stE, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hDRes(size_DRes, 1, stDRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); 
if(size_E) CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvd_hegvd_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvd_hegvd_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvd_hegvd(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dE.data(), stE, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || 
argus.norm_check) sygvd_hegvd_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvd_hegvd_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dE, stE, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stD, stE, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", "strideB", "strideD", "strideE", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, stD, stE, bc); } else { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVD_HEGVD(...) 
\ extern template void testing_sygvd_hegvd<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVD_HEGVD, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvdj_hegvdj.cpp000066400000000000000000000033371503202240500254610ustar00rootroot00000000000000/* ************************************************************************ * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ************************************************************************ */ #include "testing_sygvdj_hegvdj.hpp" #define TESTING_SYGVDJ_HEGVDJ(...) 
template void testing_sygvdj_hegvdj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVDJ_HEGVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvdj_hegvdj.hpp000066400000000000000000000747131503202240500254740ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sygvdj_hegvdj_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, U dD, const rocblas_stride stD, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, rocblas_eform(0), evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, rocblas_evect(0), uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, rocblas_evect_tridiagonal, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, 
dD, stD, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, dD, stD, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, (U) nullptr, stD, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, (U) nullptr, stD, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_sygvdj_hegvdj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stD = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvdj_hegvdj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc); } else { // memory allocations 
device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvdj_hegvdj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc); } } template void sygvdj_hegvdj_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_init(hA, true); rocblas_init(hB, false); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; } else { hA[b][i + j * lda] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] 
= hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvdj_hegvdj_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Uh& hD, Uh& hDRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; using S = decltype(std::real(T{})); rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc)); CHECK_HIP_ERROR(hDRes.transfer_from(dD)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hD[b], work.data(), lwork, rwork.data(), hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). 
// check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; } double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hD - hDRes|| / ||hD|| // using frobenius norm if(hInfoRes[b][0] == 0) { err = norm_error('F', 1, n, 1, hD[b], hDRes[b]); *max_err = err > *max_err ? err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfoRes[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hDRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } } } } template void sygvdj_hegvdj_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, Ud& dD, const rocblas_stride stD, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Uh& hD, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); if(!perf) { // cpu-lapack performance (only if not in perf mode) *cpu_time_used = nan(""); } sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvdj_hegvdj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void 
testing_sygvdj_hegvdj(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stD = argus.get("strideD", n); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stDRes = (argus.unit_check || argus.norm_check) ? stD : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_D = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_DRes = (argus.unit_check || argus.norm_check) ? 
size_D : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvdj_hegvdj( STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, (S*)nullptr, stD, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hDRes(size_DRes, 1, stDRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); 
device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvdj_hegvdj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvdj_hegvdj_getPerfData( handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvdj_hegvdj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, dD.data(), stD, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvdj_hegvdj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, hARes, hB, hD, hDRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvdj_hegvdj_getPerfData( handle, itype, 
evect, uplo, n, dA, lda, stA, dB, ldb, stB, dD, stD, dInfo, bc, hA, hB, hD, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideD", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stD, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", "strideB", "strideD", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, stD, bc); } else { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVDJ_HEGVDJ(...) 
\ extern template void testing_sygvdj_hegvdj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVDJ_HEGVDJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvdx_hegvdx.cpp000066400000000000000000000033331503202240500255110ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sygvdx_hegvdx.hpp" #define TESTING_SYGVDX_HEGVDX(...) 
template void testing_sygvdx_hegvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVDX_HEGVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvdx_hegvdx.hpp000066400000000000000000001462651503202240500255320ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/

#pragma once

#include "common/matrix_utils/matrix_utils.hpp"
#include "common/misc/client_util.hpp"
#include "common/misc/clientcommon.hpp"
#include "common/misc/clss.hpp"
#include "common/misc/lapack_host_reference.hpp"
#include "common/misc/norm.hpp"
#include "common/misc/rocsolver.hpp"
#include "common/misc/rocsolver_arguments.hpp"
#include "common/misc/rocsolver_test.hpp"

//
// Calls `rocsolver_sygvdx_hegvdx` repeatedly, each time with exactly one
// argument replaced by an invalid one, and expects the matching status:
//
//   - null handle                                   -> invalid_handle
//   - out-of-range itype / evect / erange / uplo    -> invalid_value
//   - batch count of -1 (only when STRIDED)         -> invalid_size
//   - null dA, dB, dNev, dW, dZ or dInfo            -> invalid_pointer
//   - n == 0 with null data pointers                -> success (quick return)
//   - batch count of 0 with null dNev/dInfo
//     (only when STRIDED)                           -> success (quick return)
//
// All remaining arguments are forwarded unchanged, so callers must pass a
// set of otherwise valid ("safe") arguments.
//
// NOTE(review): the template parameter list is not visible in this copy of
// the file; the body relies on STRIDED, T, S and U being template parameters.
//
template void sygvdx_hegvdx_checkBadArgs(const rocblas_handle handle,
                                         const rocblas_eform itype,
                                         const rocblas_evect evect,
                                         const rocblas_erange erange,
                                         const rocblas_fill uplo,
                                         const rocblas_int n,
                                         T dA,
                                         const rocblas_int lda,
                                         const rocblas_stride stA,
                                         T dB,
                                         const rocblas_int ldb,
                                         const rocblas_stride stB,
                                         const S vl,
                                         const S vu,
                                         const rocblas_int il,
                                         const rocblas_int iu,
                                         rocblas_int* dNev,
                                         U dW,
                                         const rocblas_stride stW,
                                         T dZ,
                                         const rocblas_int ldz,
                                         const rocblas_stride stZ,
                                         rocblas_int* dInfo,
                                         const rocblas_int bc)
{
    // handle
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, nullptr, itype, evect, erange, uplo, n,
                                                  dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev,
                                                  dW, stW, dZ, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_handle);

    // values
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, rocblas_eform(0), evect, erange,
                                                  uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il,
                                                  iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, rocblas_evect_tridiagonal,
                                                  erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl,
                                                  vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, rocblas_erange(0),
                                                  uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il,
                                                  iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_value);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange,
                                                  rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB,
                                                  vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ,
                                                  dInfo, bc),
                          rocblas_status_invalid_value);

    // sizes (only check batch_count if applicable)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo,
                                                      n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu,
                                                      dNev, dW, stW, dZ, ldz, stZ, dInfo, -1),
                              rocblas_status_invalid_size);

    // pointers
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n,
                                                  (T) nullptr, lda, stA, dB, ldb, stB, vl, vu, il,
                                                  iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n,
                                                  dA, lda, stA, (T) nullptr, ldb, stB, vl, vu, il,
                                                  iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n,
                                                  dA, lda, stA, dB, ldb, stB, vl, vu, il, iu,
                                                  (rocblas_int*)nullptr, dW, stW, dZ, ldz, stZ,
                                                  dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n,
                                                  dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev,
                                                  (U) nullptr, stW, dZ, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n,
                                                  dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev,
                                                  dW, stW, (T) nullptr, ldz, stZ, dInfo, bc),
                          rocblas_status_invalid_pointer);
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n,
                                                  dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev,
                                                  dW, stW, dZ, ldz, stZ, (rocblas_int*)nullptr, bc),
                          rocblas_status_invalid_pointer);

    // quick return with invalid pointers
    // (n == 0: the null data pointers must not be reported as errors)
    EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, 0,
                                                  (T) nullptr, lda, stA, (T) nullptr, ldb, stB, vl,
                                                  vu, il, iu, dNev, (U) nullptr, stW, (T) nullptr,
                                                  ldz, stZ, dInfo, bc),
                          rocblas_status_success);

    // quick return with zero batch_count if applicable
    // (batch count 0: null dNev and dInfo must not be reported as errors)
    if(STRIDED)
        EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo,
                                                      n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu,
                                                      (rocblas_int*)nullptr, dW, stW, dZ, ldz, stZ,
                                                      (rocblas_int*)nullptr, 0),
                              rocblas_status_success);
}

//
// Entry point of the bad-argument test: sets up a minimal set of valid
// arguments (n = 1, all leading dimensions/strides = 1, one batch instance,
// value range (0, 1]), allocates one-element device buffers and hands
// everything to `sygvdx_hegvdx_checkBadArgs`.
//
// When BATCHED is true, A, B and Z are allocated as `device_batch_vector`;
// otherwise as `device_strided_batch_vector`. W, Nev and Info are always
// strided-batch vectors.
//
// NOTE(review): the template parameter list is not visible in this copy of
// the file; the body relies on BATCHED and T (and STRIDED in the callee).
//
template void testing_sygvdx_hegvdx_bad_arg()
{
    // real type associated with T
    using S = decltype(std::real(T{}));

    // safe arguments
    rocblas_local_handle handle;
    rocblas_int n = 1;
    rocblas_int lda = 1;
    rocblas_int ldb = 1;
    rocblas_int ldz = 1;
    rocblas_stride stA = 1;
    rocblas_stride stB = 1;
    rocblas_stride stW = 1;
    rocblas_stride stZ = 1;
    rocblas_int bc = 1;
    rocblas_eform itype = rocblas_eform_ax;
    rocblas_evect evect = rocblas_evect_original;
    rocblas_erange erange = rocblas_erange_value;
    rocblas_fill uplo = rocblas_fill_upper;

    S vl = 0.0;
    S vu = 1.0;
    rocblas_int il = 0;
    rocblas_int iu = 0;

    if(BATCHED)
    {
        // memory allocations
        device_batch_vector dA(1, 1, 1);
        device_batch_vector dB(1, 1, 1);
        device_batch_vector dZ(1, 1, 1);
        device_strided_batch_vector dW(1, 1, 1, 1);
        device_strided_batch_vector dNev(1, 1, 1, 1);
        device_strided_batch_vector dInfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dB.memcheck());
        CHECK_HIP_ERROR(dZ.memcheck());
        CHECK_HIP_ERROR(dW.memcheck());
        CHECK_HIP_ERROR(dNev.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check bad arguments
        sygvdx_hegvdx_checkBadArgs(handle, itype, evect, erange, uplo, n, dA.data(), lda, stA,
                                   dB.data(), ldb, stB, vl, vu, il, iu, dNev.data(), dW.data(), stW,
                                   dZ.data(), ldz, stZ, dInfo.data(), bc);
    }
    else
    {
        // memory allocations
        device_strided_batch_vector dA(1, 1, 1, 1);
        device_strided_batch_vector dB(1, 1, 1, 1);
        device_strided_batch_vector dZ(1, 1, 1, 1);
        device_strided_batch_vector dW(1, 1, 1, 1);
        device_strided_batch_vector dNev(1, 1, 1, 1);
        device_strided_batch_vector dInfo(1, 1, 1, 1);
        CHECK_HIP_ERROR(dA.memcheck());
        CHECK_HIP_ERROR(dB.memcheck());
        CHECK_HIP_ERROR(dZ.memcheck());
        CHECK_HIP_ERROR(dW.memcheck());
        CHECK_HIP_ERROR(dNev.memcheck());
        CHECK_HIP_ERROR(dInfo.memcheck());

        // check bad arguments
        sygvdx_hegvdx_checkBadArgs(handle, itype, evect, erange, uplo, n, dA.data(), lda, stA,
                                   dB.data(), ldb, stB, vl, vu, il, iu, dNev.data(), dW.data(), stW,
                                   dZ.data(), ldz, stZ, dInfo.data(), bc);
    }
}

//
// If the environment variable:
//
//    ROCSOLVER_SYGVDX_HEGVDX_USE_LEGACY_TESTS
//
// is defined, `sygvdx_hegvdx_getError` will compute errors using the
// legacy error bounds (for debugging purposes).
//
// Otherwise the new error bounds are always used.
//
// Only the presence of the variable is checked; its value (even an empty
// one) is ignored.
//
static bool sygvdx_hegvdx_use_legacy_tests()
{
    bool status = false;
    if(std::getenv("ROCSOLVER_SYGVDX_HEGVDX_USE_LEGACY_TESTS") != nullptr)
    {
        status = true;
    }
    return status;
}

//
// The default behaviour of `sygvdx_hegvdx_getError()` is to check if the
// number of computed eigenvalues matches the number of reference eigenvalues,
// and then to check all computed eigenvalues for their accuracy, but this
// behaviour can be relaxed. This leads to two modes of operation: a relaxed
// check and a full (default) check. Those are controlled by function
// `test_for_equality_of_number_of_computed_eigenvalues()`, below, in the
// following manner:
//
// a) If `ROCSOLVER_LAX_EIGENSOLVERS_TESTS` is defined, then the test suite
// will only use the subset of computed eigenvalues that match reference
// eigenvalues (up to the given tolerance); except
//
// b) If `ROCSOLVER_FULL_EIGENSOLVERS_TESTS` is defined, then the test suite
// will unconditionally check all eigenvalues for their accuracy.
//
// The relaxed tests are intended as a means to decouple the computation of
// error bounds of eigenvalues and eigenvectors, allowing tests to pass in the
// case that not all eigenvalues could be accurately computed, but all accurate
// eigenvalues have accurate eigenvectors. If eigenvectors are not accurate,
// the corresponding tests will fail both in full mode and in relaxed mode.
// // Note: the relaxed version of the tests is only supported when using the new // error bounds, see also function `sygvdx_hegvdx_use_legacy_tests()`. // static bool test_for_equality_of_number_of_computed_eigenvalues() { bool status = true; #if defined(ROCSOLVER_LAX_EIGENSOLVERS_TESTS) status = false; #else if(std::getenv("ROCSOLVER_LAX_EIGENSOLVERS_TESTS") != nullptr) { status = false; } #endif if(std::getenv("ROCSOLVER_FULL_EIGENSOLVERS_TESTS") != nullptr) { status = true; } return status; } template void sygvdx_hegvdx_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_int ldu = n; host_strided_batch_vector U(n * n, 1, n * n, bc); rocblas_init(hA, true); rocblas_init(U, true); bool use_legacy_tests = sygvdx_hegvdx_use_legacy_tests(); for(rocblas_int b = 0; b < bc; ++b) { // for testing purposes, we start with a reduced matrix M for the standard equivalent problem // with spectrum in a desired range (-20, 20). Then we construct the generalized pair // (A, B) from there. 
memset(hB[b], 0, sizeof(T) * n * ldb); // since ldb >= n, make sure all entries of B are initialized for(rocblas_int i = 0; i < n; i++) { // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular) for(rocblas_int j = i; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; U[b][i + j * ldu] = std::real(U[b][i + j * ldu]) / 100 + 1; hB[b][i + j * ldb] = U[b][i + j * ldu]; } else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; U[b][i + j * ldu] = (U[b][i + j * ldu] - 5) / 100; hB[b][i + j * ldb] = U[b][i + j * ldu]; hB[b][j + i * ldb] = 0; U[b][j + i * ldu] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // form B = U' U T one = T(1); cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hB[b], ldb); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } if(itype == rocblas_eform_ax) { // form A = U' M U cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trmm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } else { // form A = inv(U) M inv(U') cpu_trsm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, 
n, one, U[b], ldu, hA[b], lda); cpu_trsm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(use_legacy_tests) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } else { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvdx_hegvdx_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Vd& dNev, Ud& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Vh& hNevRes, Uh& hW, Uh& hWRes, Th& hZ, Th& hZRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular, size_t& hashA, size_t& hashB, size_t& hashW, size_t& hashZ) { using HMat = HostMatrix; using BDesc = typename HMat::BlockDescriptor; constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 
7 * n : 0); int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); std::vector hIfail(n); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); std::vector> clss(bc); std::vector skip_test(bc, false); bool use_legacy_tests = sygvdx_hegvdx_use_legacy_tests(); bool test_for_equality = test_for_equality_of_number_of_computed_eigenvalues(); // input data initialization sygvdx_hegvdx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // hash inputs hashA = deterministic_hash(hA, bc); hashB = deterministic_hash(hB, bc); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = 2 * get_safemin(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail.data(), hInfo[b]); // Capture failures where B is not positive definite (hInfo[b][0] > n), // or where the i-argument has an illegal value (hInfo[b][0] < 0). All other LAPACK // failures skip the test. if((hInfo[b][0] > 0) && (hInfo[b][0] <= n)) { skip_test[b] = true; } } // // Given an eigenvalue l_i of the symmetric matrix A and a computed // eigenvalue l_i^* (obtained with a backward stable method), Weyl's // theorem yields |l_i - l_i^*| <= K*ulp*||A||_2, where K depends on n. // For the sake of this test, we will set K = C * n, with C ~ 1. // // Thus, if the range to look for eigenvalues is the interval (vl, vu], // calls to the solver should look for computed eigenvalues in the range // (vl - tol, vu + tol], where `tol = C * n * ulp * ||A||`. 
// S C = 4; std::vector tols(bc, 0); std::vector norms(bc, 0); S tol = 0; for(rocblas_int b = 0; b < bc; ++b) { if(hNev[b][0] > 0) { // Get lapack eigenvalues (reference to which rocSOLVER's sygvdx will be compared to) auto eigsLapack = *HMat::Convert(hW[b], hNev[b][0], 1); norms[b] = eigsLapack.max_coeff_norm(); } else { norms[b] = S(0); } tols[b] = C * n * std::numeric_limits::epsilon() * norms[b]; if(std::isfinite(tols[b]) && (tols[b] > tol)) { tol = tols[b]; } } // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvdx_hegvdx( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dInfo.data(), bc)); CHECK_HIP_ERROR(hNevRes.transfer_from(dNev)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); // hash outputs hashW = deterministic_hash(hWRes, bc); if(evect != rocblas_evect_none) hashZ = deterministic_hash(hZRes, bc); // Except for the cases in which B is indefinite, we expect the eigensolver // to converge for all input matrices. // check info for illegal values and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { // Capture failures where B is not positive definite (hInfo[b][0] > n), // or where the i-argument has an illegal value (hInfo[b][0] < 0). All other LAPACK // failures skip the test. 
if(skip_test[b]) continue; EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; auto numMatchingEigs = clss[b](hW[b], hNev[b][0], hWRes[b], hNevRes[b][0], tols[b]); if(test_for_equality) { EXPECT_EQ(hNev[b][0], numMatchingEigs) << "where b = " << b; if(hNev[b][0] != numMatchingEigs) *max_err += 1; } } // // Compute errors // double err; for(rocblas_int b = 0; b < bc; ++b) { auto [lapackEigs, rocsolverEigs] = clss[b].subseqs(); auto [_, rocsolverEigsIds] = clss[b].subseqs_ids(); auto numMatchingEigs = rocsolverEigs.size(); // Number of eigenvalues computed by rocSOLVER auto numRocsolverEigs = hNevRes[b][0]; // Only check accuracy for tests in which both computed and reference values exist and are well defined. if(skip_test[b] || (numMatchingEigs == 0) || (hInfo[b][0] != 0)) continue; if(evect == rocblas_evect_none) { // // Only eigenvalues // if(use_legacy_tests) { err = norm_error('F', 1, numMatchingEigs, 1, lapackEigs.data(), rocsolverEigs.data()); *max_err = err > *max_err ? err : *max_err; } else { // Get computed eigenvalues auto eigs = *HMat::Convert(rocsolverEigs.data(), rocsolverEigs.size(), 1); // convert eigenvalues from type S to type T, if required // Get lapack (reference) eigenvalues auto eigsRef = *HMat::Convert(lapackEigs.data(), lapackEigs.size(), 1); // convert eigenvalues from type S to type T, if required err = (eigs - eigsRef).norm() / eigsRef.norm(); *max_err = err > *max_err ? 
err : *max_err; } } else { // // Both eigenvalues and eigenvectors // if(use_legacy_tests) { T alpha = 1; T beta = 0; // hZRes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, numRocsolverEigs, alpha, B[b], ldb, hZRes[b], ldz, beta, hB[b], ldb); auto [_, hWResIds] = clss[b].subseqs_ids(); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < numMatchingEigs; j++) { int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair alpha = T(1) / hWRes[b][jj]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hZRes[b] + jj * ldz, 1, beta, hA[b] + j * lda, 1); } // move B*x into hZRes for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < numMatchingEigs; j++) { int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair hZRes[b][i + j * ldz] = hB[b][i + jj * ldb]; } } } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < numMatchingEigs; j++) { int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair alpha = T(1) / hWRes[b][jj]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + jj * ldb, 1, beta, hA[b] + j * lda, 1); } // move hZRes for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < numMatchingEigs; j++) { int jj = hWResIds[j]; // Id of rocSOLVER eigen-pair associated to j-th LAPACK eigen-pair if(j != jj) hZRes[b][i + j * ldz] = hZRes[b][i + jj * ldz]; } } } // error is ||hA - hZRes|| / ||hA|| // using frobenius norm err = norm_error('F', n, numMatchingEigs, lda, hA[b], hZRes[b], ldz); *max_err = err > *max_err ? 
err : *max_err; } else // if(!use_legacy_tests) { // // Prepare input // // Get computed eigenvalues auto eigs = *HMat::Convert(rocsolverEigs.data(), rocsolverEigs.size(), 1); // convert eigenvalues from type S to type T, if required // Get lapack (reference) eigenvalues auto eigsRef = *HMat::Convert(lapackEigs.data(), lapackEigs.size(), 1); // convert eigenvalues from type S to type T, if required // Create thin wrappers of input matrices A and B auto AWrap = HMat::Wrap(A.data() + b * lda * n, lda, n); auto BWrap = HMat::Wrap(B.data() + b * ldb * n, ldb, n); // We want the sub-blocks starting from row 0, col 0 and with size n x n of A and B auto A_b = (*AWrap).block(BDesc().nrows(n).ncols(n)); auto B_b = (*BWrap).block(BDesc().nrows(n).ncols(n)); // Get computed eigenvectors auto V_b = (*HMat::Wrap(hZRes[b], ldz, n)).block(BDesc().nrows(n).ncols(numRocsolverEigs)); // If rocSOLVER computed more eigen-pairs then the number of // reference eigenvalues, select the eigen-pairs that match the // reference if(numRocsolverEigs > numMatchingEigs) { rocblas_int ii; for(rocblas_int i = 0; i < numMatchingEigs; ++i) { ii = rocsolverEigsIds[i]; V_b.col(i, V_b.col(ii)); } V_b = V_b.block(BDesc().nrows(n).ncols(numMatchingEigs)); } // // Check eigenpairs' accuracy with a "Relative Weyl" error // bound, which (at its simplest form) states the following. // // Let X (cond(X) < Inf), and A (A^* = A) be such that A has // eigenvalues {a_i} and H = X^t*A*X has eigenvalues {h_i}. // Then: // // |a_i - h_i| <= |a_i|*||X^t*X - I||_2 // // Note: for rocSOLVER's sygv, if V is the eigenvectors' matrix // and B = L*L^t, then either X = L^t*V (cases 1 and 2) or X = // inv(L)*V (case 3). 
// auto VE = HMat::Empty(); if(itype == rocblas_eform_bax) { VE = adjoint(V_b) * inv(B_b) * V_b - HMat::Eye(numMatchingEigs); } else // if ((itype == rocblas_eform_ax) || (itype == rocblas_eform_abx)) { VE = adjoint(V_b) * B_b * V_b - HMat::Eye(numMatchingEigs); } S eta = std::max(VE.norm(), std::numeric_limits::epsilon()); *max_err = eta > *max_err ? eta : *max_err; auto AE = HMat::Empty(); if(itype == rocblas_eform_abx) { auto Z = B_b * V_b; AE = adjoint(Z) * A_b * Z - HMat::Zeros(numMatchingEigs).diag(eigs); } else // if ((itype == rocblas_eform_ax) || (itype == rocblas_eform_bax)) { AE = adjoint(V_b) * A_b * V_b - HMat::Zeros(numMatchingEigs).diag(eigs); } err = AE.norm() / eigsRef.norm(); err *= std::numeric_limits::epsilon() / eta; *max_err = err > *max_err ? err : *max_err; } } } } template void sygvdx_hegvdx_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, Vd& dNev, Ud& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Uh& hW, Th& hZ, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); if(!perf) { // cpu-lapack performance (only if not in perf mode) *cpu_time_used = nan(""); } sygvdx_hegvdx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvdx_hegvdx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, 
singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvdx_hegvdx( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvdx_hegvdx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvdx_hegvdx(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_int ldz = argus.get("ldz", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stW = argus.get("strideW", n); rocblas_stride stZ = argus.get("strideZ", ldz * n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 
1 : 0); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stWRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? stW : 0; rocblas_stride stZRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? stZ : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_W = size_t(n); size_t size_Z = size_t(ldz) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t hashA = 0, hashB = 0, hashW = 0, hashZ = 0; size_t size_WRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_W : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? 
size_Z : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || (evect != rocblas_evect_none && ldz < n) || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvdx_hegvdx( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvdx_hegvdx( STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector 
hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWRes(size_WRes, 1, stWRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dNev(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hZ(size_Z, 1, bc); host_batch_vector hZRes(size_ZRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dZ(size_Z, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) sygvdx_hegvdx_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc, hA, hB, hNev, hNevRes, hW, hWRes, hZ, hZRes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashB, hashW, hashZ); // collect performance data if(argus.timing) sygvdx_hegvdx_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc, hA, hB, hNev, hW, hZ, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, 
argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hZ(size_Z, 1, stZ, bc); host_strided_batch_vector hZRes(size_ZRes, 1, stZRes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dZ(size_Z, 1, stZ, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) sygvdx_hegvdx_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc, hA, hB, hNev, hNevRes, hW, hWRes, hZ, hZRes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashB, hashW, hashZ); // collect performance data if(argus.timing) sygvdx_hegvdx_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, dNev, dW, stW, dZ, ldz, stZ, dInfo, bc, hA, hB, hNev, hW, hZ, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 8 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 8 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "strideW", 
"ldz", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, stW, ldz, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "strideA", "strideB", "vl", "vu", "il", "iu", "strideW", "ldz", "strideZ", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, stA, stB, vl, vu, il, iu, stW, ldz, stZ, bc); } else { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "ldz"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, ldz); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time", "gpu_time", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time", "gpu_time"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); if(argus.hash_check) { rocsolver_bench_output("hash(A)", "hash(B)", "hash(W)", "hash(Z)"); rocsolver_bench_output(ROCSOLVER_FORMAT_HASH(hashA), ROCSOLVER_FORMAT_HASH(hashB), ROCSOLVER_FORMAT_HASH(hashW), ROCSOLVER_FORMAT_HASH(hashZ)); rocsolver_bench_endl(); } } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVDX_HEGVDX(...) \ extern template void testing_sygvdx_hegvdx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVDX_HEGVDX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvdx_hegvdx_inplace.hpp000066400000000000000000001171221503202240500272130ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sygvdx_hegvdx_inplace_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, rocblas_int* hNev, U dW, const rocblas_stride stW, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, nullptr, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, rocblas_eform(0), evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, rocblas_evect_tridiagonal, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, rocblas_erange(0), uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc), rocblas_status_invalid_value); // sizes (only check 
batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, (U) nullptr, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNev, dW, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, vl, vu, il, iu, abstol, hNev, (U) nullptr, stW, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, dW, stW, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void 
testing_sygvdx_hegvdx_inplace_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stW = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_upper; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory allocations host_strided_batch_vector hNev(1, 1, 1, 1); device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvdx_hegvdx_inplace_checkBadArgs( handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dInfo.data(), bc); } else { // memory allocations host_strided_batch_vector hNev(1, 1, 1, 1); device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvdx_hegvdx_inplace_checkBadArgs( handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNev.data(), dW.data(), stW, dInfo.data(), bc); } } template void sygvdx_hegvdx_inplace_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& 
hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_int ldu = n; host_strided_batch_vector U(n * n, 1, n * n, bc); rocblas_init(hA, true); rocblas_init(U, true); for(rocblas_int b = 0; b < bc; ++b) { // for testing purposes, we start with a reduced matrix M for the standard equivalent problem // with spectrum in a desired range (-20, 20). Then we construct the generalized pair // (A, B) from there. for(rocblas_int i = 0; i < n; i++) { // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular) for(rocblas_int j = i; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; U[b][i + j * ldu] = std::real(U[b][i + j * ldu]) / 100 + 1; hB[b][i + j * ldb] = U[b][i + j * ldu]; } else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; U[b][i + j * ldu] = (U[b][i + j * ldu] - 5) / 100; hB[b][i + j * ldb] = U[b][i + j * ldu]; hB[b][j + i * ldb] = 0; U[b][j + i * ldu] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // form B = U' U T one = T(1); cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hB[b], ldb); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } if(itype == rocblas_eform_ax) { // form A = U' M U cpu_trmm(rocblas_side_left, 
rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trmm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } else { // form A = inv(U) M inv(U') cpu_trsm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trsm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvdx_hegvdx_inplace_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vh& hNevRes, Ud& dW, const rocblas_stride stW, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Vh& hNev, Uh& hW, Uh& hWRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 
7 * n : 0); int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); std::vector Z(ldz * n); std::vector ifail(n); // input data initialization sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). 
// check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; } // Check number of returned eigenvalues for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hNev[b][0], hNevRes[b][0]) << "where b = " << b; if(hNev[b][0] != hNevRes[b][0]) *max_err += 1; } double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfo[b][0] == 0) { err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfo[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, hNev[b][0], alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < hNev[b][0]; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, hNev[b][0], lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } } } } template void sygvdx_hegvdx_inplace_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vh& hNevRes, Ud& dW, const rocblas_stride stW, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Uh& hW, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 7 * n : 0); int liwork = 5 * n; int ldz = n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); std::vector Z(ldz * n); std::vector ifail(n); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], Z.data(), ldz, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvdx_hegvdx_inplace_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvdx_hegvdx_inplace(Arguments& argus) { using S = 
decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stW = argus.get("strideW", n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stWRes = (argus.unit_check || argus.norm_check) ? 
stW : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_W = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvdx_hegvdx_inplace(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); 
host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWRes(size_WRes, 1, stWRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvdx_hegvdx_inplace_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hARes, hB, hNev, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvdx_hegvdx_inplace_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hB, hNev, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); 
if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvdx_hegvdx_inplace( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, hNevRes.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvdx_hegvdx_inplace_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hARes, hB, hNev, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvdx_hegvdx_inplace_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, hNevRes, dW, stW, dInfo, bc, hA, hB, hNev, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 3 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 3 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol, stW, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "strideA", "strideB", "vl", "vu", "il", "iu", "abstol", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, stA, stB, vl, vu, il, iu, abstol, stW, bc); } else { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", 
"abstol"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time", "gpu_time", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time", "gpu_time"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvj_hegvj.cpp000066400000000000000000000033301503202240500251420ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sygvj_hegvj.hpp" #define TESTING_SYGVJ_HEGVJ(...) template void testing_sygvj_hegvj<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVJ_HEGVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvj_hegvj.hpp000066400000000000000000001076021503202240500251560ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sygvj_hegvj_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, const SS abstol, S dResidual, const rocblas_int max_sweeps, U dSweeps, S dW, const rocblas_stride stW, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, nullptr, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, rocblas_eform(0), evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, rocblas_evect(0), uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); 
EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, rocblas_evect_tridiagonal, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, (S) nullptr, max_sweeps, dSweeps, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, (U) nullptr, dW, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stW, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick 
return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, (S) nullptr, stW, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, (S) nullptr, max_sweeps, (U) nullptr, dW, stW, (U) nullptr, 0), rocblas_status_success); } template void testing_sygvj_hegvj_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stW = 1; rocblas_stride stE = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_none; rocblas_fill uplo = rocblas_fill_upper; S abstol = 0; rocblas_int max_sweeps = 100; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvj_hegvj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dResidual(1, 1, 1, 1); device_strided_batch_vector dSweeps(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); 
device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvj_hegvj_checkBadArgs(handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); } } template void sygvj_hegvj_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { rocblas_int info; rocblas_init(hA, true); rocblas_init(hB, false); for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 400; hB[b][i + j * ldb] = std::real(hB[b][i + j * ldb]) + 400; } else { hA[b][i + j * lda] -= 4; } } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = 
hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvj_hegvj_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Th& hB, Sh& hResidualRes, Ih& hSweepsRes, Sh& hW, Sh& hWRes, Ih& hInfo, Ih& hInfoRes, double* max_err, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; S atol = (abstol <= 0) ? get_epsilon() : abstol; rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); CHECK_HIP_ERROR(hResidualRes.transfer_from(dResidual)); CHECK_HIP_ERROR(hSweepsRes.transfer_from(dSweeps)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) CHECK_HIP_ERROR(hARes.transfer_from(dA)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hW[b], work.data(), lwork, rwork.data(), hInfo[b]); } // (We expect the used input matrices to always 
converge) // check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; } // Also check validity of residual for(rocblas_int b = 0; b < bc; ++b) if(hInfoRes[b][0] == 0) { EXPECT_GE(hResidualRes[b][0], 0) << "where b = " << b; if(hResidualRes[b][0] < 0) *max_err += 1; } // Also check validity of sweeps for(rocblas_int b = 0; b < bc; ++b) if(hInfoRes[b][0] == 0) { EXPECT_GE(hSweepsRes[b][0], 0) << "where b = " << b; EXPECT_LE(hSweepsRes[b][0], max_sweeps) << "where b = " << b; if(hSweepsRes[b][0] < 0 || hSweepsRes[b][0] > max_sweeps) *max_err += 1; } double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfoRes[b][0] == 0) { err = norm_error('F', 1, n, 1, hW[b], hWRes[b]); *max_err = err > *max_err ? 
err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfoRes[b][0] == 0) { T alpha = 1; T beta = 0; // hARes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, n, alpha, B[b], ldb, hARes[b], lda, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hARes[b] + j * lda, 1, beta, hA[b] + j * lda, 1); } // move B*x into hARes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < n; j++) hARes[b][i + j * lda] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < n; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; } } } } template void sygvj_hegvj_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S abstol, Sd& dResidual, const rocblas_int max_sweeps, Id& dSweeps, Sd& dW, const rocblas_stride stW, Id& dInfo, const rocblas_int bc, Th& hA, Th& hB, Sh& hW, Ih& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; rocblas_int lwork = (COMPLEX ? 2 * n - 1 : 3 * n - 1); rocblas_int lrwork = (COMPLEX ? 
3 * n - 2 : 0); std::vector work(lwork); std::vector rwork(lrwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); if(!perf) { sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygv_hegv(itype, evect, uplo, n, hA[b], lda, hB[b], ldb, hW[b], work.data(), lwork, rwork.data(), hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvj_hegvj_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvj_hegvj(Arguments& argus) { using S = decltype(std::real(T{})); // get 
arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stW = argus.get("strideD", n); S abstol = S(argus.get("abstol", 0)); rocblas_int max_sweeps = argus.get("max_sweeps", 100); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stWRes = (argus.unit_check || argus.norm_check) ? stW : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_W = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_WRes = (argus.unit_check || argus.norm_check) ? 
size_W : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvj_hegvj( STRIDED, handle, itype, evect, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, abstol, (S*)nullptr, max_sweeps, (rocblas_int*)nullptr, (S*)nullptr, stW, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hResidualRes(1, 1, 1, bc); host_strided_batch_vector hSweepsRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWRes(size_WRes, 1, stWRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); // device 
device_strided_batch_vector dResidual(1, 1, 1, bc); device_strided_batch_vector dSweeps(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); CHECK_HIP_ERROR(dResidual.memcheck()); CHECK_HIP_ERROR(dSweeps.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_batch_vector hB(size_B, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvj_hegvj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hB, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvj_hegvj_getPerfData(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hB, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) 
CHECK_HIP_ERROR(dB.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS( rocsolver_sygvj_hegvj(STRIDED, handle, itype, evect, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, abstol, dResidual.data(), max_sweeps, dSweeps.data(), dW.data(), stW, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sygvj_hegvj_getError(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hARes, hB, hResidualRes, hSweepsRes, hW, hWRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sygvj_hegvj_getPerfData(handle, itype, evect, uplo, n, dA, lda, stA, dB, ldb, stB, abstol, dResidual, max_sweeps, dSweeps, dW, stW, dInfo, bc, hA, hB, hW, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, abstol, max_sweeps, stW, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "strideA", "strideB", "abstol", "max_sweeps", "strideW", "batch_c"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, stA, stB, abstol, max_sweeps, stW, bc); } else { rocsolver_bench_output("itype", "evect", "uplo", "n", "lda", "ldb", "abstol", "max_sweeps"); rocsolver_bench_output(itypeC, evectC, uploC, n, lda, ldb, abstol, max_sweeps); } rocsolver_bench_header("Results:"); if(argus.norm_check) { 
rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVJ_HEGVJ(...) \ extern template void testing_sygvj_hegvj<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVJ_HEGVJ, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvx_hegvx.cpp000066400000000000000000000033301503202240500251760ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sygvx_hegvx.hpp" #define TESTING_SYGVX_HEGVX(...) template void testing_sygvx_hegvx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYGVX_HEGVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sygvx_hegvx.hpp000066400000000000000000001337231503202240500252150ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sygvx_hegvx_checkBadArgs(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, T dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, rocblas_int* dNev, U dW, const rocblas_stride stW, T dZ, const rocblas_int ldz, const rocblas_stride stZ, rocblas_int* dIfail, const rocblas_stride stF, rocblas_int* dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, nullptr, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, rocblas_eform(0), evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); 
EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, rocblas_evect_tridiagonal, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, rocblas_erange(0), uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, rocblas_fill_full, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T) nullptr, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, (T) nullptr, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, (U) nullptr, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); 
EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, (T) nullptr, ldz, stZ, dIfail, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, (rocblas_int*)nullptr, stF, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, 0, (T) nullptr, lda, stA, (T) nullptr, ldb, stB, vl, vu, il, iu, abstol, dNev, (U) nullptr, stW, (T) nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, dW, stW, dZ, ldz, stZ, dIfail, stF, (rocblas_int*)nullptr, 0), rocblas_status_success); } template void testing_sygvx_hegvx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_int ldb = 1; rocblas_int ldz = 1; rocblas_stride stA = 1; rocblas_stride stB = 1; rocblas_stride stW = 1; rocblas_stride stZ = 1; rocblas_stride stF = 1; rocblas_int bc = 1; rocblas_eform itype = rocblas_eform_ax; rocblas_evect evect = rocblas_evect_original; rocblas_erange erange = rocblas_erange_value; rocblas_fill uplo = rocblas_fill_upper; S vl = 0.0; S vu = 1.0; rocblas_int il = 0; rocblas_int iu = 0; S abstol = 0; if(BATCHED) { // memory 
allocations device_batch_vector dA(1, 1, 1); device_batch_vector dB(1, 1, 1); device_batch_vector dZ(1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvx_hegvx_checkBadArgs(handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dB(1, 1, 1, 1); device_strided_batch_vector dZ(1, 1, 1, 1); device_strided_batch_vector dW(1, 1, 1, 1); device_strided_batch_vector dNev(1, 1, 1, 1); device_strided_batch_vector dIfail(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dB.memcheck()); CHECK_HIP_ERROR(dZ.memcheck()); CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dNev.memcheck()); CHECK_HIP_ERROR(dIfail.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sygvx_hegvx_checkBadArgs(handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc); } } template void sygvx_hegvx_initData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const rocblas_int bc, Th& hA, Th& hB, host_strided_batch_vector& A, host_strided_batch_vector& B, const bool test, const bool singular) { if(CPU) { 
rocblas_int info; rocblas_int ldu = n; host_strided_batch_vector U(n * n, 1, n * n, bc); rocblas_init(hA, true); rocblas_init(U, true); for(rocblas_int b = 0; b < bc; ++b) { // for testing purposes, we start with a reduced matrix M for the standard equivalent problem // with spectrum in a desired range (-20, 20). Then we construct the generalized pair // (A, B) from there. for(rocblas_int i = 0; i < n; i++) { // scale matrices and set hA = M (symmetric/hermitian), hB = U (upper triangular) for(rocblas_int j = i; j < n; j++) { if(i == j) { hA[b][i + j * lda] = std::real(hA[b][i + j * lda]) + 10; U[b][i + j * ldu] = std::real(U[b][i + j * ldu]) / 100 + 1; hB[b][i + j * ldb] = U[b][i + j * ldu]; } else { if(j == i + 1) { hA[b][i + j * lda] = (hA[b][i + j * lda] - 5) / 10; hA[b][j + i * lda] = sconj(hA[b][i + j * lda]); } else hA[b][j + i * lda] = hA[b][i + j * lda] = 0; U[b][i + j * ldu] = (U[b][i + j * ldu] - 5) / 100; hB[b][i + j * ldb] = U[b][i + j * ldu]; hB[b][j + i * ldb] = 0; U[b][j + i * ldu] = 0; } } if(i == n / 4 || i == n / 2 || i == n - 1 || i == n / 7 || i == n / 5 || i == n / 3) hA[b][i + i * lda] *= -1; } // form B = U' U T one = T(1); cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hB[b], ldb); if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // make some matrices B not positive definite // always the same elements for debugging purposes // the algorithm must detect the lower order of the principal minors <= 0 // in those matrices in the batch that are non positive definite rocblas_int i = n / 4 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n / 2 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; i = n - 1 + b; i -= (i / n) * n; hB[b][i + i * ldb] = 0; } if(itype == rocblas_eform_ax) { // form A = U' M U cpu_trmm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); 
cpu_trmm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } else { // form A = inv(U) M inv(U') cpu_trsm(rocblas_side_left, rocblas_fill_upper, rocblas_operation_none, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); cpu_trsm(rocblas_side_right, rocblas_fill_upper, rocblas_operation_conjugate_transpose, rocblas_diagonal_non_unit, n, n, one, U[b], ldu, hA[b], lda); } // store A and B for testing purposes if(test && evect != rocblas_evect_none) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(itype != rocblas_eform_bax) { A[b][i + j * lda] = hA[b][i + j * lda]; B[b][i + j * ldb] = hB[b][i + j * ldb]; } else { A[b][i + j * lda] = hB[b][i + j * ldb]; B[b][i + j * ldb] = hA[b][i + j * lda]; } } } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); CHECK_HIP_ERROR(dB.transfer_from(hB)); } } template void sygvx_hegvx_getError(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vd& dNev, Ud& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Vd& dIfail, const rocblas_stride stF, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Vh& hNevRes, Uh& hW, Uh& hWRes, Th& hZ, Th& hZRes, Vh& hIfail, Vh& hIfailRes, Vh& hInfo, Vh& hInfoRes, double* max_err, const bool singular, size_t& hashA, size_t& hashB, size_t& hashW, size_t& hashZ) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 
7 * n : 0); int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(lda * n, 1, lda * n, bc); host_strided_batch_vector B(ldb * n, 1, ldb * n, bc); // input data initialization sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, true, singular); // hash inputs hashA = deterministic_hash(hA, bc); hashB = deterministic_hash(hB, bc); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc)); CHECK_HIP_ERROR(hNevRes.transfer_from(dNev)); CHECK_HIP_ERROR(hWRes.transfer_from(dW)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); if(evect != rocblas_evect_none) { CHECK_HIP_ERROR(hZRes.transfer_from(dZ)); CHECK_HIP_ERROR(hIfailRes.transfer_from(dIfail)); } // hash outputs hashW = deterministic_hash(hWRes, bc); if(evect != rocblas_evect_none) hashZ = deterministic_hash(hZRes, bc); // CPU lapack // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 2 * get_safemin() : abstol; for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hInfo[b]); } // (We expect the used input matrices to always converge. Testing // implicitly the equivalent non-converged matrix is very complicated and it boils // down to essentially run the algorithm again and until convergence is achieved. // We do test with indefinite matrices B). 
// check info for non-convergence and/or positive-definiteness *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) *max_err += 1; } // Check number of returned eigenvalues for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hNev[b][0], hNevRes[b][0]) << "where b = " << b; if(hNev[b][0] != hNevRes[b][0]) *max_err += 1; } double err; for(rocblas_int b = 0; b < bc; ++b) { if(evect == rocblas_evect_none) { // only eigenvalues needed; can compare with LAPACK // error is ||hW - hWRes|| / ||hW|| // using frobenius norm if(hInfo[b][0] == 0) { err = norm_error('F', 1, hNev[b][0], 1, hW[b], hWRes[b]); *max_err = err > *max_err ? err : *max_err; } } else { // both eigenvalues and eigenvectors needed; need to implicitly test // eigenvectors due to non-uniqueness of eigenvectors under scaling if(hInfo[b][0] == 0) { // check ifail err = 0; for(int j = 0; j < hNev[b][0]; j++) { EXPECT_EQ(hIfailRes[b][j], 0) << "where b = " << b << ", j = " << j; if(hIfailRes[b][j] != 0) err++; } *max_err = err > *max_err ? 
err : *max_err; T alpha = 1; T beta = 0; // hZRes contains eigenvectors x // compute B*x (or A*x) and store in hB cpu_symm_hemm(rocblas_side_left, uplo, n, hNev[b][0], alpha, B[b], ldb, hZRes[b], ldz, beta, hB[b], ldb); if(itype == rocblas_eform_ax) { // problem is A*x = (lambda)*B*x // compute (1/lambda)*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hZRes[b] + j * ldz, 1, beta, hA[b] + j * lda, 1); } // move B*x into hZRes for(rocblas_int i = 0; i < n; i++) for(rocblas_int j = 0; j < hNev[b][0]; j++) hZRes[b][i + j * ldz] = hB[b][i + j * ldb]; } else { // problem is A*B*x = (lambda)*x or B*A*x = (lambda)*x // compute (1/lambda)*A*B*x or (1/lambda)*B*A*x and store in hA for(int j = 0; j < hNev[b][0]; j++) { alpha = T(1) / hWRes[b][j]; cpu_symv_hemv(uplo, n, alpha, A[b], lda, hB[b] + j * ldb, 1, beta, hA[b] + j * lda, 1); } } // error is ||hA - hZRes|| / ||hA|| // using frobenius norm err = norm_error('F', n, hNev[b][0], lda, hA[b], hZRes[b], ldz); *max_err = err > *max_err ? err : *max_err; } else if(hInfo[b][0] <= n) { // check ifail err = 0; for(int j = 0; j < hInfo[b][0]; j++) { EXPECT_NE(hIfailRes[b][j], 0) << "where b = " << b << ", j = " << j; if(hIfailRes[b][j] == 0) err++; } *max_err = err > *max_err ? 
err : *max_err; } } } } template void sygvx_hegvx_getPerfData(const rocblas_handle handle, const rocblas_eform itype, const rocblas_evect evect, const rocblas_erange erange, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Td& dB, const rocblas_int ldb, const rocblas_stride stB, const S vl, const S vu, const rocblas_int il, const rocblas_int iu, const S abstol, Vd& dNev, Ud& dW, const rocblas_stride stW, Td& dZ, const rocblas_int ldz, const rocblas_stride stZ, Vd& dIfail, const rocblas_stride stF, Vd& dInfo, const rocblas_int bc, Th& hA, Th& hB, Vh& hNev, Uh& hW, Th& hZ, Vh& hIfail, Vh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { constexpr bool COMPLEX = rocblas_is_complex; int lwork = (COMPLEX ? 2 * n : 8 * n); int lrwork = (COMPLEX ? 7 * n : 0); int liwork = 5 * n; std::vector work(lwork); std::vector rwork(lrwork); std::vector iwork(liwork); host_strided_batch_vector A(1, 1, 1, 1); host_strided_batch_vector B(1, 1, 1, 1); // abstol = 0 ensures max accuracy in rocsolver; for lapack we should use 2*safemin S atol = (abstol == 0) ? 
2 * get_safemin() : abstol; if(!perf) { sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_sygvx_hegvx(itype, evect, erange, uplo, n, hA[b], lda, hB[b], ldb, vl, vu, il, iu, atol, hNev[b], hW[b], hZ[b], ldz, work.data(), lwork, rwork.data(), iwork.data(), hIfail[b], hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); CHECK_ROCBLAS_ERROR(rocsolver_sygvx_hegvx( STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sygvx_hegvx_initData(handle, itype, evect, n, dA, lda, stA, dB, ldb, stB, bc, hA, hB, A, B, false, singular); start = get_time_us_sync(stream); rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sygvx_hegvx(Arguments& argus) { using S = 
decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char itypeC = argus.get("itype"); char evectC = argus.get("evect"); char erangeC = argus.get("erange"); char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_int ldb = argus.get("ldb", n); rocblas_int ldz = argus.get("ldz", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stB = argus.get("strideB", ldb * n); rocblas_stride stW = argus.get("strideW", n); rocblas_stride stF = argus.get("strideF", n); rocblas_stride stZ = argus.get("strideZ", ldz * n); S vl = S(argus.get("vl", 0)); S vu = S(argus.get("vu", erangeC == 'V' ? 1 : 0)); rocblas_int il = argus.get("il", erangeC == 'I' ? 1 : 0); rocblas_int iu = argus.get("iu", erangeC == 'I' ? 1 : 0); S abstol = S(argus.get("abstol", 0)); rocblas_eform itype = char2rocblas_eform(itypeC); rocblas_evect evect = char2rocblas_evect(evectC); rocblas_erange erange = char2rocblas_erange(erangeC); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stWRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? stW : 0; rocblas_stride stZRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? stZ : 0; rocblas_stride stFRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? 
stF : 0; // check non-supported values if(uplo == rocblas_fill_full || evect == rocblas_evect_tridiagonal) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_B = size_t(ldb) * n; size_t size_W = size_t(n); size_t size_Z = size_t(ldz) * n; size_t size_ifail = size_W; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t hashA = 0, hashB = 0, hashW = 0, hashZ = 0; size_t size_WRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_W : 0; size_t size_ZRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? size_Z : 0; size_t size_ifailRes = (argus.unit_check || argus.norm_check || argus.hash_check) ? 
size_ifail : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || ldb < n || (evect != rocblas_evect_none && ldz < n) || bc < 0 || (erange == rocblas_erange_value && vl >= vu) || (erange == rocblas_erange_index && (il < 1 || iu < 0)) || (erange == rocblas_erange_index && (iu > n || (n > 0 && il > iu)))); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS( rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sygvx_hegvx( STRIDED, handle, itype, evect, erange, uplo, n, (T* const*)nullptr, lda, stA, (T* const*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T* const*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sygvx_hegvx( STRIDED, handle, itype, evect, erange, uplo, n, (T*)nullptr, lda, stA, (T*)nullptr, ldb, stB, vl, vu, il, iu, abstol, (rocblas_int*)nullptr, (S*)nullptr, stW, (T*)nullptr, ldz, stZ, (rocblas_int*)nullptr, stF, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hNev(1, 1, 1, bc); host_strided_batch_vector hNevRes(1, 1, 1, bc); host_strided_batch_vector hW(size_W, 1, stW, bc); host_strided_batch_vector hWRes(size_WRes, 1, stWRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); host_strided_batch_vector hIfail(size_ifail, 1, stF, bc); host_strided_batch_vector hIfailRes(size_ifailRes, 1, stFRes, bc); // device device_strided_batch_vector dNev(1, 1, 1, bc); device_strided_batch_vector dW(size_W, 1, stW, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); device_strided_batch_vector dIfail(size_ifail, 1, stF, bc); CHECK_HIP_ERROR(dNev.memcheck()); if(size_W) CHECK_HIP_ERROR(dW.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_ifail) CHECK_HIP_ERROR(dIfail.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hB(size_B, 1, bc); host_batch_vector hZ(size_Z, 1, bc); host_batch_vector hZRes(size_ZRes, 1, bc); device_batch_vector dA(size_A, 1, bc); device_batch_vector dB(size_B, 1, bc); device_batch_vector dZ(size_Z, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) sygvx_hegvx_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc, hA, hB, 
hNev, hNevRes, hW, hWRes, hZ, hZRes, hIfail, hIfailRes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashB, hashW, hashZ); // collect performance data if(argus.timing) sygvx_hegvx_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc, hA, hB, hNev, hW, hZ, hIfail, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hB(size_B, 1, stB, bc); host_strided_batch_vector hZ(size_Z, 1, stZ, bc); host_strided_batch_vector hZRes(size_ZRes, 1, stZRes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dB(size_B, 1, stB, bc); device_strided_batch_vector dZ(size_Z, 1, stZ, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); if(size_B) CHECK_HIP_ERROR(dB.memcheck()); if(size_Z) CHECK_HIP_ERROR(dZ.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sygvx_hegvx(STRIDED, handle, itype, evect, erange, uplo, n, dA.data(), lda, stA, dB.data(), ldb, stB, vl, vu, il, iu, abstol, dNev.data(), dW.data(), stW, dZ.data(), ldz, stZ, dIfail.data(), stF, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check || argus.hash_check) sygvx_hegvx_getError( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, dInfo, bc, hA, hB, hNev, hNevRes, hW, hWRes, hZ, hZRes, hIfail, hIfailRes, hInfo, hInfoRes, &max_error, argus.singular, hashA, hashB, hashW, hashZ); // collect performance data if(argus.timing) sygvx_hegvx_getPerfData( handle, itype, evect, erange, uplo, n, dA, lda, stA, dB, ldb, stB, vl, vu, il, iu, abstol, dNev, dW, stW, dZ, ldz, stZ, dIfail, stF, 
dInfo, bc, hA, hB, hNev, hW, hZ, hIfail, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using 2 * n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, 2 * n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideF", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol, stW, ldz, stF, bc); } else if(STRIDED) { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "strideA", "strideB", "vl", "vu", "il", "iu", "abstol", "strideW", "ldz", "strideZ", "strideF", "batch_c"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, stA, stB, vl, vu, il, iu, abstol, stW, ldz, stZ, stF, bc); } else { rocsolver_bench_output("itype", "evect", "erange", "uplo", "n", "lda", "ldb", "vl", "vu", "il", "iu", "abstol", "ldz"); rocsolver_bench_output(itypeC, evectC, erangeC, uploC, n, lda, ldb, vl, vu, il, iu, abstol, ldz); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time", "gpu_time", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time", "gpu_time"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); if(argus.hash_check) { rocsolver_bench_output("hash(A)", "hash(B)", "hash(W)", "hash(Z)"); rocsolver_bench_output(ROCSOLVER_FORMAT_HASH(hashA), ROCSOLVER_FORMAT_HASH(hashB), ROCSOLVER_FORMAT_HASH(hashW), ROCSOLVER_FORMAT_HASH(hashZ)); rocsolver_bench_endl(); } } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments 
were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYGVX_HEGVX(...) \ extern template void testing_sygvx_hegvx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYGVX_HEGVX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sytf2_sytrf.cpp000066400000000000000000000034411503202240500251160ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sytf2_sytrf.hpp" #define TESTING_SYTF2_SYTRF(...) 
template void testing_sytf2_sytrf<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYTF2_SYTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sytf2_sytrf.hpp000066400000000000000000000605151503202240500251300ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sytf2_sytrf_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dIpiv, const rocblas_stride stP, U dinfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, nullptr, uplo, n, dA, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, rocblas_fill_full, n, dA, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, dinfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T) nullptr, lda, stA, dIpiv, stP, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, (U) nullptr, stP, dinfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, 0, (T) nullptr, lda, stA, (U) nullptr, stP, dinfo, bc), rocblas_status_success); if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, (U) nullptr, 0), rocblas_status_success); // quick return with zero 
batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA, lda, stA, dIpiv, stP, dinfo, 0), rocblas_status_success); } template void testing_sytf2_sytrf_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sytf2_sytrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dIpiv(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dIpiv.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments sytf2_sytrf_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); } } template void sytf2_sytrf_initData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } // shuffle rows to test pivoting // always the same permuation for debugging purposes for(rocblas_int i = 0; i < n / 2; i++) { for(rocblas_int j = 0; j < n; j++) { tmp = hA[b][i + j * lda]; hA[b][i + j * lda] = hA[b][n - 1 - i + j * lda]; 
hA[b][n - 1 - i + j * lda] = tmp; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int j = n / 4 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[b][i + j * lda] = 0; hA[b][j + i * lda] = 0; } j = n / 2 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[b][i + j * lda] = 0; hA[b][j + i * lda] = 0; } j = n - 1 + b; j -= (j / n) * n; for(rocblas_int i = 0; i < n; i++) { hA[b][i + j * lda] = 0; hA[b][j + i * lda] = 0; } } } } if(GPU) { // now copy data to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void sytf2_sytrf_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hIpiv, Uh& hIpivRes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { int lwork = (SYTRF ? 64 * n : 0); std::vector work(lwork); // input data initialization sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hIpivRes.transfer_from(dIpiv)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { SYTRF ? cpu_sytrf(uplo, n, hA[b], lda, hIpiv[b], work.data(), lwork, hInfo[b]) : cpu_sytf2(uplo, n, hA[b], lda, hIpiv[b], hInfo[b]); } // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. 
// IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? err : *max_err; // also check pivoting (count the number of incorrect pivots) err = 0; for(rocblas_int i = 0; i < n; ++i) { EXPECT_EQ(hIpiv[b][i], hIpivRes[b][i]) << "where b = " << b << ", i = " << i; if(hIpiv[b][i] != hIpivRes[b][i]) err++; } *max_err = err > *max_err ? err : *max_err; } // also check info err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; } template void sytf2_sytrf_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dIpiv, const rocblas_stride stP, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hIpiv, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { int lwork = (SYTRF ? 64 * n : 0); std::vector work(lwork); if(!perf) { sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { SYTRF ? 
cpu_sytrf(uplo, n, hA[b], lda, hIpiv[b], work.data(), lwork, hInfo[b]) : cpu_sytf2(uplo, n, hA[b], lda, hIpiv[b], hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); // cold calls for(int iter = 0; iter < 2; iter++) { sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); CHECK_ROCBLAS_ERROR(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sytf2_sytrf_initData(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, singular); start = get_time_us_sync(stream); rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_sytf2_sytrf(Arguments& argus) { // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stP = argus.get("strideP", n); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; rocblas_stride stPRes = (argus.unit_check || argus.norm_check) ? 
stP : 0; // check non-supported values if(uplo == rocblas_fill_full) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; size_t size_P = size_t(n); double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; size_t size_PRes = (argus.unit_check || argus.norm_check) ? size_P : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS( rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY( rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, stP, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { 
rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytf2_sytrf_getError(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sytf2_sytrf_getPerfData( handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hIpiv(size_P, 1, stP, bc); host_strided_batch_vector hIpivRes(size_PRes, 1, stPRes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dIpiv(size_P, 1, stP, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) 
CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); if(size_P) CHECK_HIP_ERROR(dIpiv.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytf2_sytrf(STRIDED, SYTRF, handle, uplo, n, dA.data(), lda, stA, dIpiv.data(), stP, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytf2_sytrf_getError(handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hARes, hIpiv, hIpivRes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) sytf2_sytrf_getPerfData( handle, uplo, n, dA, lda, stA, dIpiv, stP, dInfo, bc, hA, hIpiv, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "lda", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stP, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "lda", "strideA", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stA, stP, bc); } else { rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed 
argus.validate_consumed(); } #define EXTERN_TESTING_SYTF2_SYTRF(...) \ extern template void testing_sytf2_sytrf<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYTF2_SYTRF, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sytxx_hetxx.cpp000066400000000000000000000034411503202240500252370ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_sytxx_hetxx.hpp" #define TESTING_SYTXX_HETXX(...) 
template void testing_sytxx_hetxx<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_SYTXX_HETXX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_sytxx_hetxx.hpp000066400000000000000000000631351503202240500252520ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void sytxx_hetxx_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, S dD, const rocblas_stride stD, S dE, const rocblas_stride stE, U dTau, const rocblas_stride stP, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, nullptr, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, rocblas_fill_full, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T) nullptr, lda, stA, dD, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, (S) nullptr, stD, dE, stE, dTau, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, (S) nullptr, stE, dTau, stP, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, (U) nullptr, stP, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, 
SYTRD, handle, uplo, 0, (T) nullptr, lda, stA, (S) nullptr, stD, (S) nullptr, stE, (U) nullptr, stP, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, 0), rocblas_status_success); } template void testing_sytxx_hetxx_bad_arg() { using S = decltype(std::real(T{})); // safe arguments rocblas_local_handle handle; rocblas_fill uplo = rocblas_fill_upper; rocblas_int n = 2; rocblas_int lda = 2; rocblas_stride stA = 1; rocblas_stride stD = 1; rocblas_stride stE = 1; rocblas_stride stP = 1; rocblas_int bc = 1; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTau(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTau.memcheck()); // check bad arguments sytxx_hetxx_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dD(1, 1, 1, 1); device_strided_batch_vector dE(1, 1, 1, 1); device_strided_batch_vector dTau(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dD.memcheck()); CHECK_HIP_ERROR(dE.memcheck()); CHECK_HIP_ERROR(dTau.memcheck()); // check bad arguments sytxx_hetxx_checkBadArgs(handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc); } } template , int> = 0> void sytxx_hetxx_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j || i == j + 1 || i 
== j - 1) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template , int> = 0> void sytxx_hetxx_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA) { if(CPU) { rocblas_init(hA, true); // scale A to avoid singularities for(rocblas_int b = 0; b < bc; ++b) { for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = hA[b][i + j * lda].real() + 400; else if(i == j + 1 || i == j - 1) hA[b][i + j * lda] += 400; else hA[b][i + j * lda] -= 4; } } } } if(GPU) { // now copy to the GPU CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void sytxx_hetxx_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Ud& dTau, const rocblas_stride stP, const rocblas_int bc, Th& hA, Th& hARes, Sh& hD, Sh& hE, Uh& hTau, double* max_err) { constexpr bool COMPLEX = rocblas_is_complex; std::vector hW(32 * n); // input data initialization sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hTau.transfer_from(dTau)); // Reconstruct matrix A from the factorization for implicit testing // A = H(n-1)...H(2)H(1)*T*H(1)'H(2)'...H(n-1)' if upper // A = H(1)H(2)...H(n-1)*T*H(n-1)'...H(2)'H(1)' if lower std::vector v(n); for(rocblas_int b = 0; b < bc; ++b) { T* a = hARes[b]; T* t = hTau[b]; if(uplo == rocblas_fill_lower) { for(rocblas_int i = 0; i < n - 2; ++i) a[i + (n - 1) * lda] = 0; a[(n - 2) + (n - 1) * lda] = a[(n - 1) + (n - 2) * lda]; // for each column for(rocblas_int j = n - 2; j >= 0; --j) { 
// prepare T and v for(rocblas_int i = 0; i < j - 1; ++i) a[i + j * lda] = 0; if(j > 0) a[(j - 1) + j * lda] = a[j + (j - 1) * lda]; for(rocblas_int i = j + 2; i < n; ++i) { v[i - j - 1] = a[i + j * lda]; a[i + j * lda] = 0; } v[0] = 1; // apply householder reflector cpu_larf(rocblas_side_left, n - 1 - j, n - j, v.data(), 1, t + j, a + (j + 1) + j * lda, lda, hW.data()); if(COMPLEX) cpu_lacgv(1, t + j, 1); cpu_larf(rocblas_side_right, n - j, n - 1 - j, v.data(), 1, t + j, a + j + (j + 1) * lda, lda, hW.data()); } } else { a[1] = a[lda]; for(rocblas_int i = 2; i < n; ++i) a[i] = 0; // for each column for(rocblas_int j = 1; j <= n - 1; ++j) { // prepare T and v for(rocblas_int i = 0; i < j - 1; ++i) { v[i] = a[i + j * lda]; a[i + j * lda] = 0; } v[j - 1] = 1; if(j < n - 1) a[(j + 1) + j * lda] = a[j + (j + 1) * lda]; for(rocblas_int i = j + 2; i < n; ++i) a[i + j * lda] = 0; // apply householder reflector cpu_larf(rocblas_side_left, j, j + 1, v.data(), 1, t + j - 1, a, lda, hW.data()); if(COMPLEX) cpu_lacgv(1, t + j - 1, 1); cpu_larf(rocblas_side_right, j + 1, j, v.data(), 1, t + j - 1, a, lda, hW.data()); } } } // error is ||hA - hARes|| / ||hA|| // using frobenius norm double err; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { *max_err = (uplo == rocblas_fill_lower) ? 
norm_error_lowerTr('F', n, n, lda, hA[b], hARes[b]) : norm_error_upperTr('F', n, n, lda, hA[b], hARes[b]); } } template void sytxx_hetxx_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Sd& dD, const rocblas_stride stD, Sd& dE, const rocblas_stride stE, Ud& dTau, const rocblas_stride stP, const rocblas_int bc, Th& hA, Sh& hD, Sh& hE, Uh& hTau, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf) { std::vector hW(32 * n); if(!perf) { sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { SYTRD ? cpu_sytrd_hetrd(uplo, n, hA[b], lda, hD[b], hE[b], hTau[b], hW.data(), 32 * n) : cpu_sytd2_hetd2(uplo, n, hA[b], lda, hD[b], hE[b], hTau[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); // cold calls for(int iter = 0; iter < 2; iter++) { sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); CHECK_ROCBLAS_ERROR(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { sytxx_hetxx_initData(handle, n, dA, lda, bc, hA); start = get_time_us_sync(stream); rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc); *gpu_time_used += get_time_us_sync(stream) - start; } 
*gpu_time_used /= hot_calls; } template void testing_sytxx_hetxx(Arguments& argus) { using S = decltype(std::real(T{})); // get arguments rocblas_local_handle handle; char uploC = argus.get("uplo"); rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); rocblas_stride stD = argus.get("strideD", n); rocblas_stride stE = argus.get("strideE", n - 1); rocblas_stride stP = argus.get("strideP", n - 1); rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = lda * n; size_t size_D = n; size_t size_E = n - 1; size_t size_tau = n - 1; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? 
size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T* const*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc)); else CHECK_ALLOC_QUERY(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, (T*)nullptr, lda, stA, (S*)nullptr, stD, (S*)nullptr, stE, (T*)nullptr, stP, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } // memory allocations (all cases) // host host_strided_batch_vector hD(size_D, 1, stD, bc); host_strided_batch_vector hE(size_E, 1, stE, bc); host_strided_batch_vector hTau(size_tau, 1, stP, bc); // device device_strided_batch_vector dD(size_D, 1, stD, bc); device_strided_batch_vector dE(size_E, 1, stE, bc); device_strided_batch_vector dTau(size_tau, 1, stP, bc); if(size_D) CHECK_HIP_ERROR(dD.memcheck()); if(size_E) CHECK_HIP_ERROR(dE.memcheck()); if(size_tau) CHECK_HIP_ERROR(dTau.memcheck()); if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); device_batch_vector dA(size_A, 1, bc); if(size_A) 
CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytxx_hetxx_getError(handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hARes, hD, hE, hTau, &max_error); // collect performance data if(argus.timing && hot_calls > 0) sytxx_hetxx_getPerfData( handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hD, hE, hTau, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_sytxx_hetxx(STRIDED, SYTRD, handle, uplo, n, dA.data(), lda, stA, dD.data(), stD, dE.data(), stE, dTau.data(), stP, bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) sytxx_hetxx_getError(handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hARes, hD, hE, hTau, &max_error); // collect performance data if(argus.timing && hot_calls > 0) sytxx_hetxx_getPerfData( handle, uplo, n, dA, lda, stA, dD, stD, dE, stE, dTau, stP, bc, hA, hD, hE, hTau, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { 
rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "n", "lda", "strideD", "strideE", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stD, stE, stP, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "n", "lda", "strideA", "strideD", "strideE", "strideP", "batch_c"); rocsolver_bench_output(uploC, n, lda, stA, stD, stE, stP, bc); } else { rocsolver_bench_output("uplo", "n", "lda"); rocsolver_bench_output(uploC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_SYTXX_HETXX(...) \ extern template void testing_sytxx_hetxx<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_SYTXX_HETXX, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_BLOCKED_VARIANT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_trtri.cpp000066400000000000000000000033001503202240500237560ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "testing_trtri.hpp" #define TESTING_TRTRI(...) template void testing_trtri<__VA_ARGS__>(Arguments&); INSTANTIATE(TESTING_TRTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/lapack/testing_trtri.hpp000066400000000000000000000454471503202240500240050ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include "common/misc/client_util.hpp" #include "common/misc/clientcommon.hpp" #include "common/misc/lapack_host_reference.hpp" #include "common/misc/norm.hpp" #include "common/misc/rocsolver.hpp" #include "common/misc/rocsolver_arguments.hpp" #include "common/misc/rocsolver_test.hpp" template void trtri_checkBadArgs(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_diagonal diag, const rocblas_int n, T dA, const rocblas_int lda, const rocblas_stride stA, U dInfo, const rocblas_int bc) { // handle EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, nullptr, uplo, diag, n, dA, lda, stA, dInfo, bc), rocblas_status_invalid_handle); // values EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, rocblas_fill_full, diag, n, dA, lda, stA, dInfo, bc), rocblas_status_invalid_value); EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, rocblas_diagonal(0), n, dA, lda, stA, dInfo, bc), 
rocblas_status_invalid_value); // sizes (only check batch_count if applicable) if(STRIDED) EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA, lda, stA, dInfo, -1), rocblas_status_invalid_size); // pointers EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA, lda, stA, (U) nullptr, bc), rocblas_status_invalid_pointer); // quick return with invalid pointers EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, 0, (T) nullptr, lda, stA, dInfo, bc), rocblas_status_success); // quick return with zero batch_count if applicable if(STRIDED) EXPECT_ROCBLAS_STATUS( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA, lda, stA, (U) nullptr, 0), rocblas_status_success); } template void testing_trtri_bad_arg() { // safe arguments rocblas_local_handle handle; rocblas_int n = 1; rocblas_int lda = 1; rocblas_stride stA = 1; rocblas_int bc = 1; rocblas_diagonal diag = rocblas_diagonal_non_unit; rocblas_fill uplo = rocblas_fill_upper; if(BATCHED) { // memory allocations device_batch_vector dA(1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments trtri_checkBadArgs(handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc); } else { // memory allocations device_strided_batch_vector dA(1, 1, 1, 1); device_strided_batch_vector dInfo(1, 1, 1, 1); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check bad arguments trtri_checkBadArgs(handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc); } } template void trtri_initData(const rocblas_handle handle, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_int bc, Th& hA, const bool singular) { if(CPU) { T tmp; rocblas_init(hA, true); for(rocblas_int b = 0; b < bc; ++b) { // scale A to avoid singularities 
for(rocblas_int i = 0; i < n; i++) { for(rocblas_int j = 0; j < n; j++) { if(i == j) hA[b][i + j * lda] = hA[b][i + j * lda] / 10.0 + 1; else hA[b][i + j * lda] = (hA[b][i + j * lda] - 4) / 10.0; } } if(singular && (b == bc / 4 || b == bc / 2 || b == bc - 1)) { // add some singularities // always the same elements for debugging purposes // the algorithm must detect the first zero pivot in those // matrices in the batch that are singular rocblas_int i = n / 4 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n / 2 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; i = n - 1 + b; i -= (i / n) * n; hA[b][i + i * lda] = 0; } } } // now copy data to the GPU if(GPU) { CHECK_HIP_ERROR(dA.transfer_from(hA)); } } template void trtri_getError(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_diagonal diag, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Th& hARes, Uh& hInfo, Uh& hInfoRes, double* max_err, const bool singular) { // input data initialization trtri_initData(handle, n, dA, lda, bc, hA, singular); // execute computations // GPU lapack CHECK_ROCBLAS_ERROR( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc)); CHECK_HIP_ERROR(hARes.transfer_from(dA)); CHECK_HIP_ERROR(hInfoRes.transfer_from(dInfo)); // CPU lapack for(rocblas_int b = 0; b < bc; ++b) { cpu_trtri(uplo, diag, n, hA[b], lda, hInfo[b]); } // check info for singularities double err = 0; *max_err = 0; for(rocblas_int b = 0; b < bc; ++b) { EXPECT_EQ(hInfo[b][0], hInfoRes[b][0]) << "where b = " << b; if(hInfo[b][0] != hInfoRes[b][0]) err++; } *max_err += err; // error is ||hA - hARes|| / ||hA|| // (THIS DOES NOT ACCOUNT FOR NUMERICAL REPRODUCIBILITY ISSUES. // IT MIGHT BE REVISITED IN THE FUTURE) // using frobenius norm for(rocblas_int b = 0; b < bc; ++b) { if(hInfoRes[b][0] == 0) { err = norm_error('F', n, n, lda, hA[b], hARes[b]); *max_err = err > *max_err ? 
err : *max_err; } } } template void trtri_getPerfData(const rocblas_handle handle, const rocblas_fill uplo, const rocblas_diagonal diag, const rocblas_int n, Td& dA, const rocblas_int lda, const rocblas_stride stA, Ud& dInfo, const rocblas_int bc, Th& hA, Uh& hInfo, double* gpu_time_used, double* cpu_time_used, const rocblas_int hot_calls, const int profile, const bool profile_kernels, const bool perf, const bool singular) { if(!perf) { trtri_initData(handle, n, dA, lda, bc, hA, singular); // cpu-lapack performance (only if not in perf mode) *cpu_time_used = get_time_us_no_sync(); for(rocblas_int b = 0; b < bc; ++b) { cpu_trtri(uplo, diag, n, hA[b], lda, hInfo[b]); } *cpu_time_used = get_time_us_no_sync() - *cpu_time_used; } trtri_initData(handle, n, dA, lda, bc, hA, singular); // cold calls for(int iter = 0; iter < 2; iter++) { trtri_initData(handle, n, dA, lda, bc, hA, singular); CHECK_ROCBLAS_ERROR( rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc)); } // gpu-lapack performance hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); double start; if(profile > 0) { if(profile_kernels) rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile | rocblas_layer_mode_ex_log_kernel); else rocsolver_log_set_layer_mode(rocblas_layer_mode_log_profile); rocsolver_log_set_max_levels(profile); } for(rocblas_int iter = 0; iter < hot_calls; iter++) { trtri_initData(handle, n, dA, lda, bc, hA, singular); start = get_time_us_sync(stream); rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc); *gpu_time_used += get_time_us_sync(stream) - start; } *gpu_time_used /= hot_calls; } template void testing_trtri(Arguments& argus) { // get arguments rocblas_local_handle handle; rocblas_int n = argus.get("n"); rocblas_int lda = argus.get("lda", n); rocblas_stride stA = argus.get("strideA", lda * n); char uploC = argus.get("uplo"); rocblas_fill uplo = char2rocblas_fill(uploC); char diagC = 
argus.get("diag"); rocblas_diagonal diag = char2rocblas_diagonal(diagC); rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; rocblas_stride stARes = (argus.unit_check || argus.norm_check) ? stA : 0; // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); else EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_value); if(argus.timing) rocsolver_bench_inform(inform_invalid_args); return; } // determine sizes size_t size_A = size_t(lda) * n; double max_error = 0, gpu_time_used = 0, cpu_time_used = 0; size_t size_ARes = (argus.unit_check || argus.norm_check) ? size_A : 0; // check invalid sizes bool invalid_size = (n < 0 || lda < n || bc < 0); if(invalid_size) { if(BATCHED) EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); else EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc), rocblas_status_invalid_size); if(argus.timing) rocsolver_bench_inform(inform_invalid_size); return; } // memory size query is necessary if(argus.mem_query || !USE_ROCBLAS_REALLOC_ON_DEMAND) { CHECK_ROCBLAS_ERROR(rocblas_start_device_memory_size_query(handle)); if(BATCHED) CHECK_ALLOC_QUERY(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T* const*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); else CHECK_ALLOC_QUERY(rocsolver_trtri(STRIDED, handle, uplo, diag, n, (T*)nullptr, lda, stA, (rocblas_int*)nullptr, bc)); size_t size; CHECK_ROCBLAS_ERROR(rocblas_stop_device_memory_size_query(handle, &size)); if(argus.mem_query) { rocsolver_bench_inform(inform_mem_query, size); return; } 
CHECK_ROCBLAS_ERROR(rocblas_set_device_memory_size(handle, size)); } if(BATCHED) { // memory allocations host_batch_vector hA(size_A, 1, bc); host_batch_vector hARes(size_ARes, 1, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_batch_vector dA(size_A, 1, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) trtri_getError(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data if(argus.timing) trtri_getPerfData(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } else { // memory allocations host_strided_batch_vector hA(size_A, 1, stA, bc); host_strided_batch_vector hARes(size_ARes, 1, stARes, bc); host_strided_batch_vector hInfo(1, 1, 1, bc); host_strided_batch_vector hInfoRes(1, 1, 1, bc); device_strided_batch_vector dA(size_A, 1, stA, bc); device_strided_batch_vector dInfo(1, 1, 1, bc); if(size_A) CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dInfo.memcheck()); // check quick return if(n == 0 || bc == 0) { EXPECT_ROCBLAS_STATUS(rocsolver_trtri(STRIDED, handle, uplo, diag, n, dA.data(), lda, stA, dInfo.data(), bc), rocblas_status_success); if(argus.timing) rocsolver_bench_inform(inform_quick_return); return; } // check computations if(argus.unit_check || argus.norm_check) trtri_getError(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hARes, hInfo, hInfoRes, &max_error, argus.singular); // collect performance data 
if(argus.timing) trtri_getPerfData(handle, uplo, diag, n, dA, lda, stA, dInfo, bc, hA, hInfo, &gpu_time_used, &cpu_time_used, hot_calls, argus.profile, argus.profile_kernels, argus.perf, argus.singular); } // validate results for rocsolver-test // using n * machine_precision as tolerance if(argus.unit_check) ROCSOLVER_TEST_CHECK(T, max_error, n); // output results for rocsolver-bench if(argus.timing) { if(!argus.perf) { rocsolver_bench_header("Arguments:"); if(BATCHED) { rocsolver_bench_output("uplo", "diag", "n", "lda", "batch_c"); rocsolver_bench_output(uploC, diagC, n, lda, bc); } else if(STRIDED) { rocsolver_bench_output("uplo", "diag", "n", "lda", "strideA", "batch_c"); rocsolver_bench_output(uploC, diagC, n, lda, stA, bc); } else { rocsolver_bench_output("uplo", "diag", "n", "lda"); rocsolver_bench_output(uploC, diagC, n, lda); } rocsolver_bench_header("Results:"); if(argus.norm_check) { rocsolver_bench_output("cpu_time_us", "gpu_time_us", "error"); rocsolver_bench_output(cpu_time_used, gpu_time_used, max_error); } else { rocsolver_bench_output("cpu_time_us", "gpu_time_us"); rocsolver_bench_output(cpu_time_used, gpu_time_used); } rocsolver_bench_endl(); } else { if(argus.norm_check) rocsolver_bench_output(gpu_time_used, max_error); else rocsolver_bench_output(gpu_time_used); } } // ensure all arguments were consumed argus.validate_consumed(); } #define EXTERN_TESTING_TRTRI(...) extern template void testing_trtri<__VA_ARGS__>(Arguments&); INSTANTIATE(EXTERN_TESTING_TRTRI, FOREACH_MATRIX_DATA_LAYOUT, FOREACH_SCALAR_TYPE, APPLY_STAMP) rocSOLVER-rocm-6.4.3/clients/common/matrix_utils/000077500000000000000000000000001503202240500216465ustar00rootroot00000000000000rocSOLVER-rocm-6.4.3/clients/common/matrix_utils/host_matrix.cpp000066400000000000000000000030221503202240500247100ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #include "host_matrix.hpp" rocSOLVER-rocm-6.4.3/clients/common/matrix_utils/host_matrix.hpp000066400000000000000000001101261503202240500247210ustar00rootroot00000000000000/* ************************************************************************** * Copyright (C) 2018-2025 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * *************************************************************************/ #pragma once #include #include #include #include #include #include #include #include #include #include "matrix_interface.hpp" #include "matrix_utils_detail.hpp" namespace matxu { template class HostMatrix : public MatrixInterface { public: using T = typename MatrixInterface::T; using I = typename MatrixInterface::I; using S = typename MatrixInterface::S; static auto Wrap(T* in_data, I nrows, I ncols) noexcept -> std::unique_ptr> { if((nrows < 1) || (ncols < 1)) { return nullptr; } auto ptr = std::unique_ptr>( new(std::nothrow) HostMatrix(in_data, nrows, ncols)); if(ptr) { ptr->is_a_wrapper_ = true; } return ptr; } /* static auto Clone(T* in_data, I nrows, I ncols) noexcept -> std::unique_ptr> */ /* { */ /* if ((nrows < 1) || (ncols < 1)) */ /* { */ /* return nullptr; */ /* } */ /* auto ptr = std::unique_ptr>(new(std::nothrow) HostMatrix(nullptr, nrows, ncols)); */ /* ptr->data_ = ptr->cgc_.alloc_and_copy(ptr->size(), in_data); */ /* if 
(ptr->data_ == nullptr) */ /* { */ /* // If ptr->data_ was not initialized, then ptr does not point to a valid matrix */ /* ptr = nullptr; */ /* } */ /* return ptr; */ /* } */ template static auto Convert(S_* in_data, I nrows, I ncols) noexcept -> std::unique_ptr> { if((nrows < 1) || (ncols < 1)) { return nullptr; } auto ptr = std::unique_ptr>(new(std::nothrow) HostMatrix(nullptr, nrows, ncols)); ptr->data_ = ptr->cgc_.alloc(ptr->size()); if(ptr->data_ == nullptr) { // If ptr->data_ was not initialized, then ptr does not point to a valid matrix ptr = nullptr; } for(I i = 0; i < ptr->size(); ++i) { ptr->operator[](i) = T(in_data[i]); } return ptr; } template